acryl-datahub 1.2.0.7rc4__py3-none-any.whl → 1.2.0.8rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/METADATA +2612 -2612
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/RECORD +35 -33
- datahub/_version.py +1 -1
- datahub/cli/delete_cli.py +1 -0
- datahub/ingestion/api/report.py +4 -0
- datahub/ingestion/autogenerated/capability_summary.json +1 -1
- datahub/ingestion/graph/client.py +8 -1
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +6 -1
- datahub/ingestion/source/iceberg/iceberg.py +74 -32
- datahub/ingestion/source/metadata/lineage.py +8 -8
- datahub/ingestion/source/redshift/redshift.py +1 -1
- datahub/ingestion/source/sql/athena.py +95 -18
- datahub/ingestion/source/sql/athena_properties_extractor.py +43 -25
- datahub/ingestion/source/superset.py +3 -2
- datahub/ingestion/source/tableau/tableau.py +8 -5
- datahub/metadata/_internal_schema_classes.py +207 -12
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +6 -0
- datahub/metadata/schema.avsc +160 -12
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -1
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +77 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +0 -3
- datahub/sql_parsing/sqlglot_lineage.py +121 -28
- datahub/sql_parsing/sqlglot_utils.py +12 -1
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.7rc4.dist-info → acryl_datahub-1.2.0.8rc2.dist-info}/top_level.txt +0 -0
|
@@ -543,16 +543,19 @@ def _select_statement_cll(
|
|
|
543
543
|
root_scope: sqlglot.optimizer.Scope,
|
|
544
544
|
column_resolver: _ColumnResolver,
|
|
545
545
|
output_table: Optional[_TableName],
|
|
546
|
+
table_name_schema_mapping: Dict[_TableName, SchemaInfo],
|
|
547
|
+
default_db: Optional[str] = None,
|
|
548
|
+
default_schema: Optional[str] = None,
|
|
546
549
|
) -> List[_ColumnLineageInfo]:
|
|
547
550
|
column_lineage: List[_ColumnLineageInfo] = []
|
|
548
551
|
|
|
549
552
|
try:
|
|
550
|
-
# List output columns.
|
|
551
553
|
output_columns = [
|
|
552
554
|
(select_col.alias_or_name, select_col) for select_col in statement.selects
|
|
553
555
|
]
|
|
554
556
|
logger.debug("output columns: %s", [col[0] for col in output_columns])
|
|
555
|
-
|
|
557
|
+
|
|
558
|
+
for output_col, _original_col_expression in output_columns:
|
|
556
559
|
if not output_col or output_col == "*":
|
|
557
560
|
# If schema information is available, the * will be expanded to the actual columns.
|
|
558
561
|
# Otherwise, we can't process it.
|
|
@@ -576,13 +579,14 @@ def _select_statement_cll(
|
|
|
576
579
|
trim_selects=False,
|
|
577
580
|
# We don't need to pass the schema in here, since we've already qualified the columns.
|
|
578
581
|
)
|
|
579
|
-
# import pathlib
|
|
580
|
-
# pathlib.Path("sqlglot.html").write_text(
|
|
581
|
-
# str(lineage_node.to_html(dialect=dialect))
|
|
582
|
-
# )
|
|
583
582
|
|
|
584
583
|
# Generate SELECT lineage.
|
|
585
|
-
direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
|
|
584
|
+
direct_raw_col_upstreams = _get_direct_raw_col_upstreams(
|
|
585
|
+
lineage_node,
|
|
586
|
+
dialect,
|
|
587
|
+
default_db,
|
|
588
|
+
default_schema,
|
|
589
|
+
)
|
|
586
590
|
|
|
587
591
|
# Fuzzy resolve the output column.
|
|
588
592
|
original_col_expression = lineage_node.expression
|
|
@@ -601,7 +605,7 @@ def _select_statement_cll(
|
|
|
601
605
|
if original_col_expression.type:
|
|
602
606
|
output_col_type = original_col_expression.type
|
|
603
607
|
|
|
604
|
-
#
|
|
608
|
+
# Resolve upstream columns - table names should already be qualified from placeholder processing
|
|
605
609
|
direct_resolved_col_upstreams = {
|
|
606
610
|
_ColumnRef(
|
|
607
611
|
table=edge.table,
|
|
@@ -706,6 +710,9 @@ def _column_level_lineage(
|
|
|
706
710
|
root_scope=root_scope,
|
|
707
711
|
column_resolver=column_resolver,
|
|
708
712
|
output_table=downstream_table,
|
|
713
|
+
table_name_schema_mapping=table_name_schema_mapping,
|
|
714
|
+
default_db=default_db,
|
|
715
|
+
default_schema=default_schema,
|
|
709
716
|
)
|
|
710
717
|
|
|
711
718
|
joins: Optional[List[_JoinInfo]] = None
|
|
@@ -726,6 +733,9 @@ def _column_level_lineage(
|
|
|
726
733
|
|
|
727
734
|
def _get_direct_raw_col_upstreams(
|
|
728
735
|
lineage_node: sqlglot.lineage.Node,
|
|
736
|
+
dialect: Optional[sqlglot.Dialect] = None,
|
|
737
|
+
default_db: Optional[str] = None,
|
|
738
|
+
default_schema: Optional[str] = None,
|
|
729
739
|
) -> OrderedSet[_ColumnRef]:
|
|
730
740
|
# Using an OrderedSet here to deduplicate upstreams while preserving "discovery" order.
|
|
731
741
|
direct_raw_col_upstreams: OrderedSet[_ColumnRef] = OrderedSet()
|
|
@@ -755,6 +765,53 @@ def _get_direct_raw_col_upstreams(
|
|
|
755
765
|
direct_raw_col_upstreams.add(
|
|
756
766
|
_ColumnRef(table=table_ref, column=normalized_col)
|
|
757
767
|
)
|
|
768
|
+
elif isinstance(node.expression, sqlglot.exp.Placeholder) and node.name != "*":
|
|
769
|
+
# Handle placeholder expressions from lateral joins.
|
|
770
|
+
#
|
|
771
|
+
# In newer SQLGlot versions, columns from lateral subqueries appear as sqlglot.exp.Placeholder
|
|
772
|
+
# expressions instead of regular table references. This is critical for lateral join column lineage.
|
|
773
|
+
#
|
|
774
|
+
# Example: In "SELECT t2.value FROM t1, LATERAL (SELECT value FROM t2 WHERE t1.id = t2.id) t2"
|
|
775
|
+
# The "t2.value" column reference creates a placeholder with name like '"my_table2"."value"'
|
|
776
|
+
# which we need to parse to establish the lineage: output.value <- my_table2.value
|
|
777
|
+
#
|
|
778
|
+
# Without this handling, lateral join column lineage would be incomplete/missing.
|
|
779
|
+
try:
|
|
780
|
+
parsed = sqlglot.parse_one(node.name, dialect=dialect)
|
|
781
|
+
if isinstance(parsed, sqlglot.exp.Column) and parsed.table:
|
|
782
|
+
table_ref = _TableName.from_sqlglot_table(
|
|
783
|
+
sqlglot.parse_one(
|
|
784
|
+
parsed.table, into=sqlglot.exp.Table, dialect=dialect
|
|
785
|
+
)
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
# SQLGlot's qualification process doesn't fully qualify placeholder names from lateral joins.
|
|
789
|
+
# Even after statement-level qualification, these placeholders remain unqualified (e.g., "t2.value").
|
|
790
|
+
# We need this runtime qualification to ensure proper lineage resolution.
|
|
791
|
+
# Only qualify if this appears to be a real table reference (not a temporary construct)
|
|
792
|
+
if (
|
|
793
|
+
not (table_ref.database or table_ref.db_schema)
|
|
794
|
+
and dialect is not None
|
|
795
|
+
):
|
|
796
|
+
table_ref = table_ref.qualified(
|
|
797
|
+
dialect=dialect,
|
|
798
|
+
default_db=default_db,
|
|
799
|
+
default_schema=default_schema,
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
# Extract column name using proper isinstance check
|
|
803
|
+
if isinstance(parsed.this, sqlglot.exp.Identifier):
|
|
804
|
+
column_name = parsed.this.name
|
|
805
|
+
else:
|
|
806
|
+
column_name = str(parsed.this)
|
|
807
|
+
direct_raw_col_upstreams.add(
|
|
808
|
+
_ColumnRef(table=table_ref, column=column_name)
|
|
809
|
+
)
|
|
810
|
+
except Exception as e:
|
|
811
|
+
logger.debug(
|
|
812
|
+
f"Failed to parse placeholder column expression: {node.name} with dialect {dialect}. The exception was: {e}",
|
|
813
|
+
exc_info=True,
|
|
814
|
+
)
|
|
758
815
|
else:
|
|
759
816
|
# This branch doesn't matter. For example, a count(*) column would go here, and
|
|
760
817
|
# we don't get any column-level lineage for that.
|
|
@@ -857,7 +914,7 @@ def _get_raw_col_upstreams_for_expression(
|
|
|
857
914
|
trim_selects=False,
|
|
858
915
|
)
|
|
859
916
|
|
|
860
|
-
return _get_direct_raw_col_upstreams(node)
|
|
917
|
+
return _get_direct_raw_col_upstreams(node, dialect, None, None)
|
|
861
918
|
finally:
|
|
862
919
|
scope.expression = original_expression
|
|
863
920
|
|
|
@@ -872,8 +929,9 @@ def _list_joins(
|
|
|
872
929
|
|
|
873
930
|
scope: sqlglot.optimizer.Scope
|
|
874
931
|
for scope in root_scope.traverse():
|
|
932
|
+
# PART 1: Handle regular explicit JOINs (updated API)
|
|
875
933
|
join: sqlglot.exp.Join
|
|
876
|
-
for join in scope.find_all(sqlglot.exp.Join):
|
|
934
|
+
for join in scope.expression.find_all(sqlglot.exp.Join):
|
|
877
935
|
left_side_tables: OrderedSet[_TableName] = OrderedSet()
|
|
878
936
|
from_clause: sqlglot.exp.From
|
|
879
937
|
for from_clause in scope.find_all(sqlglot.exp.From):
|
|
@@ -949,6 +1007,36 @@ def _list_joins(
|
|
|
949
1007
|
)
|
|
950
1008
|
)
|
|
951
1009
|
|
|
1010
|
+
# PART 2: Handle LATERAL constructs
|
|
1011
|
+
for lateral in scope.expression.find_all(sqlglot.exp.Lateral):
|
|
1012
|
+
# Get tables from non-lateral FROM clauses
|
|
1013
|
+
qualified_left: OrderedSet[_TableName] = OrderedSet()
|
|
1014
|
+
for from_clause in scope.find_all(sqlglot.exp.From):
|
|
1015
|
+
if not isinstance(from_clause.this, sqlglot.exp.Lateral):
|
|
1016
|
+
qualified_left.update(
|
|
1017
|
+
_get_join_side_tables(from_clause.this, dialect, scope)
|
|
1018
|
+
)
|
|
1019
|
+
|
|
1020
|
+
# Get tables from lateral subquery
|
|
1021
|
+
qualified_right: OrderedSet[_TableName] = OrderedSet()
|
|
1022
|
+
if lateral.this and isinstance(lateral.this, sqlglot.exp.Subquery):
|
|
1023
|
+
qualified_right.update(
|
|
1024
|
+
_TableName.from_sqlglot_table(t)
|
|
1025
|
+
for t in lateral.this.find_all(sqlglot.exp.Table)
|
|
1026
|
+
)
|
|
1027
|
+
qualified_right.update(qualified_left)
|
|
1028
|
+
|
|
1029
|
+
if qualified_left and qualified_right:
|
|
1030
|
+
joins.append(
|
|
1031
|
+
_JoinInfo(
|
|
1032
|
+
join_type="LATERAL JOIN",
|
|
1033
|
+
left_tables=list(qualified_left),
|
|
1034
|
+
right_tables=list(qualified_right),
|
|
1035
|
+
on_clause=None,
|
|
1036
|
+
columns_involved=[],
|
|
1037
|
+
)
|
|
1038
|
+
)
|
|
1039
|
+
|
|
952
1040
|
return joins
|
|
953
1041
|
|
|
954
1042
|
|
|
@@ -1186,25 +1274,30 @@ def _translate_internal_joins(
|
|
|
1186
1274
|
) -> List[JoinInfo]:
|
|
1187
1275
|
joins = []
|
|
1188
1276
|
for raw_join in raw_joins:
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1277
|
+
try:
|
|
1278
|
+
joins.append(
|
|
1279
|
+
JoinInfo(
|
|
1280
|
+
join_type=raw_join.join_type,
|
|
1281
|
+
left_tables=[
|
|
1282
|
+
table_name_urn_mapping[table] for table in raw_join.left_tables
|
|
1283
|
+
],
|
|
1284
|
+
right_tables=[
|
|
1285
|
+
table_name_urn_mapping[table] for table in raw_join.right_tables
|
|
1286
|
+
],
|
|
1287
|
+
on_clause=raw_join.on_clause,
|
|
1288
|
+
columns_involved=[
|
|
1289
|
+
ColumnRef(
|
|
1290
|
+
table=table_name_urn_mapping[col.table],
|
|
1291
|
+
column=col.column,
|
|
1292
|
+
)
|
|
1293
|
+
for col in raw_join.columns_involved
|
|
1294
|
+
],
|
|
1295
|
+
)
|
|
1206
1296
|
)
|
|
1207
|
-
|
|
1297
|
+
except KeyError as e:
|
|
1298
|
+
# Skip joins that reference tables we can't resolve (e.g., from CTE subqueries)
|
|
1299
|
+
logger.debug(f"Skipping join with unresolvable table: {e}")
|
|
1300
|
+
continue
|
|
1208
1301
|
return joins
|
|
1209
1302
|
|
|
1210
1303
|
|
|
@@ -115,6 +115,8 @@ def _expression_to_string(
|
|
|
115
115
|
return expression.sql(dialect=get_dialect(platform))
|
|
116
116
|
|
|
117
117
|
|
|
118
|
+
PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION = re.compile(r"(%s|\$\d|\?)")
|
|
119
|
+
|
|
118
120
|
_BASIC_NORMALIZATION_RULES = {
|
|
119
121
|
# Remove /* */ comments.
|
|
120
122
|
re.compile(r"/\*.*?\*/", re.DOTALL): "",
|
|
@@ -130,7 +132,9 @@ _BASIC_NORMALIZATION_RULES = {
|
|
|
130
132
|
re.compile(r"'[^']*'"): "?",
|
|
131
133
|
# Replace sequences of IN/VALUES with a single placeholder.
|
|
132
134
|
# The r" ?" makes it more robust to uneven spacing.
|
|
133
|
-
re.compile(
|
|
135
|
+
re.compile(
|
|
136
|
+
r"\b(IN|VALUES)\s*\( ?(?:%s|\$\d|\?)(?:, ?(?:%s|\$\d|\?))* ?\)", re.IGNORECASE
|
|
137
|
+
): r"\1 (?)",
|
|
134
138
|
# Normalize parenthesis spacing.
|
|
135
139
|
re.compile(r"\( "): "(",
|
|
136
140
|
re.compile(r" \)"): ")",
|
|
@@ -139,6 +143,9 @@ _BASIC_NORMALIZATION_RULES = {
|
|
|
139
143
|
# e.g. "col1,col2" -> "col1, col2"
|
|
140
144
|
re.compile(r"\b ,"): ",",
|
|
141
145
|
re.compile(r"\b,\b"): ", ",
|
|
146
|
+
# MAKE SURE THAT THIS IS AFTER THE ABOVE REPLACEMENT
|
|
147
|
+
# Replace all versions of placeholders with generic ? placeholder.
|
|
148
|
+
PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION: "?",
|
|
142
149
|
}
|
|
143
150
|
_TABLE_NAME_NORMALIZATION_RULES = {
|
|
144
151
|
# Replace UUID-like strings with a placeholder (both - and _ variants).
|
|
@@ -262,6 +269,10 @@ def get_query_fingerprint_debug(
|
|
|
262
269
|
if not fast:
|
|
263
270
|
dialect = get_dialect(platform)
|
|
264
271
|
expression_sql = generalize_query(expression, dialect=dialect)
|
|
272
|
+
# Normalize placeholders for consistent fingerprinting. This only needs to be backward compatible with generalized queries generated by earlier sqlglot versions, where the placeholders were always ?
|
|
273
|
+
expression_sql = PLACEHOLDER_BACKWARD_FINGERPRINT_NORMALIZATION.sub(
|
|
274
|
+
"?", expression_sql
|
|
275
|
+
)
|
|
265
276
|
else:
|
|
266
277
|
expression_sql = generalize_query_fast(expression, dialect=platform)
|
|
267
278
|
except (ValueError, sqlglot.errors.SqlglotError) as e:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|