acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/METADATA +2631 -2631
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/RECORD +41 -39
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/external/external_entities.py +500 -15
- datahub/ingestion/source/aws/glue.py +18 -14
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/tag_entities.py +82 -104
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/hex/api.py +2 -0
- datahub/ingestion/source/hex/mapper.py +16 -2
- datahub/ingestion/source/hex/model.py +2 -0
- datahub/ingestion/source/looker/looker_common.py +26 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
- datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/mssql/source.py +2 -25
- datahub/ingestion/source/sql/mysql.py +54 -0
- datahub/ingestion/source/sql/postgres.py +5 -134
- datahub/ingestion/source/sql/sql_common.py +137 -0
- datahub/ingestion/source/superset.py +140 -56
- datahub/ingestion/source/unity/config.py +11 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +20 -6
- datahub/ingestion/source/unity/report.py +9 -1
- datahub/ingestion/source/unity/source.py +51 -16
- datahub/ingestion/source/unity/tag_entities.py +49 -147
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +4 -2
- datahub/metadata/schemas/Operation.avsc +4 -2
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/postgres.py

@@ -36,14 +36,8 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
-from datahub.ingestion.source.sql.sql_utils import (
-    gen_database_key,
-    gen_schema_key,
-)
 from datahub.ingestion.source.sql.stored_procedures.base import (
     BaseProcedure,
-    generate_procedure_container_workunits,
-    generate_procedure_workunits,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
@@ -132,10 +126,12 @@ class PostgresConfig(BasePostgresConfig):
             "Note: this is not used if `database` or `sqlalchemy_uri` are provided."
         ),
     )
+
     include_stored_procedures: bool = Field(
         default=True,
         description="Include ingest of stored procedures.",
     )
+
     procedure_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for stored procedures to filter in ingestion."
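The two PostgresConfig fields added above gate stored-procedure ingestion and filter which procedures are kept. A minimal sketch of how they could be set from Python; the connection values are placeholders and only the last two arguments come from this diff:

    from datahub.configuration.common import AllowDenyPattern
    from datahub.ingestion.source.sql.postgres import PostgresConfig

    # Placeholder connection settings; include_stored_procedures and
    # procedure_pattern are the options introduced in this release.
    config = PostgresConfig(
        host_port="localhost:5432",
        database="analytics",
        username="datahub",
        include_stored_procedures=True,
        procedure_pattern=AllowDenyPattern(deny=["analytics\\.internal\\..*"]),
    )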
@@ -310,73 +306,6 @@ class PostgresSource(SQLAlchemySource):
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")
 
-    def get_schema_level_workunits(
-        self,
-        inspector: Inspector,
-        schema: str,
-        database: str,
-    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
-        yield from super().get_schema_level_workunits(
-            inspector=inspector,
-            schema=schema,
-            database=database,
-        )
-
-        if self.config.include_stored_procedures:
-            try:
-                yield from self.loop_stored_procedures(inspector, schema, self.config)
-            except Exception as e:
-                self.report.failure(
-                    title="Failed to list stored procedures for schema",
-                    message="An error occurred while listing procedures for the schema.",
-                    context=f"{database}.{schema}",
-                    exc=e,
-                )
-
-    def loop_stored_procedures(
-        self,
-        inspector: Inspector,
-        schema: str,
-        config: PostgresConfig,
-    ) -> Iterable[MetadataWorkUnit]:
-        """
-        Loop schema data for get stored procedures as dataJob-s.
-        """
-        db_name = self.get_db_name(inspector)
-
-        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
-        if procedures:
-            yield from self._process_procedures(procedures, db_name, schema)
-
-    def fetch_procedures_for_schema(
-        self, inspector: Inspector, schema: str, db_name: str
-    ) -> List[BaseProcedure]:
-        try:
-            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
-                inspector, schema, db_name
-            )
-            procedures: List[BaseProcedure] = []
-            for procedure in raw_procedures:
-                procedure_qualified_name = self.get_identifier(
-                    schema=schema,
-                    entity=procedure.name,
-                    inspector=inspector,
-                )
-
-                if not self.config.procedure_pattern.allowed(procedure_qualified_name):
-                    self.report.report_dropped(procedure_qualified_name)
-                else:
-                    procedures.append(procedure)
-            return procedures
-        except Exception as e:
-            self.report.warning(
-                title="Failed to get procedures for schema",
-                message="An error occurred while fetching procedures for the schema.",
-                context=f"{db_name}.{schema}",
-                exc=e,
-            )
-            return []
-
     def get_procedures_for_schema(
         self, inspector: Inspector, schema: str, db_name: str
     ) -> List[BaseProcedure]:
@@ -401,10 +330,9 @@ class PostgresSource(SQLAlchemySource):
                 pg_language l ON l.oid = p.prolang
             WHERE
                 p.prokind = 'p'
-                AND n.nspname =
-
-
-            """
+                AND n.nspname = %s;
+            """,
+            (schema,),
         )
 
         procedure_rows = list(procedures)
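The change above switches the pg_catalog lookup from interpolating the schema name into the SQL text to passing it as a bound parameter alongside the query. A standalone illustration of that pattern with a plain DB-API cursor; the connection string and simplified query here are placeholders, not the source's actual execution path:

    import psycopg2  # illustrative driver choice; the source itself goes through SQLAlchemy

    conn = psycopg2.connect("dbname=analytics user=datahub")  # placeholder DSN
    cur = conn.cursor()
    schema = "public"
    # %s is a bound parameter, so the schema value is never spliced into the SQL string.
    cur.execute(
        "SELECT p.proname "
        "FROM pg_catalog.pg_proc p "
        "JOIN pg_catalog.pg_namespace n ON n.oid = p.pronamespace "
        "WHERE p.prokind = 'p' AND n.nspname = %s;",
        (schema,),
    )
    print(cur.fetchall())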
@@ -423,60 +351,3 @@ class PostgresSource(SQLAlchemySource):
                 )
             )
         return base_procedures
-
-    def _process_procedures(
-        self,
-        procedures: List[BaseProcedure],
-        db_name: str,
-        schema: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        if procedures:
-            yield from generate_procedure_container_workunits(
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-            )
-        for procedure in procedures:
-            yield from self._process_procedure(procedure, schema, db_name)
-
-    def _process_procedure(
-        self,
-        procedure: BaseProcedure,
-        schema: str,
-        db_name: str,
-    ) -> Iterable[MetadataWorkUnit]:
-        try:
-            yield from generate_procedure_workunits(
-                procedure=procedure,
-                database_key=gen_database_key(
-                    database=db_name,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_key=gen_schema_key(
-                    db_name=db_name,
-                    schema=schema,
-                    platform=self.platform,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                schema_resolver=self.get_schema_resolver(),
-            )
-        except Exception as e:
-            self.report.warning(
-                title="Failed to emit stored procedure",
-                message="An error occurred while emitting stored procedure",
-                context=procedure.name,
-                exc=e,
-            )
datahub/ingestion/source/sql/sql_common.py

@@ -27,6 +27,7 @@ from sqlalchemy.exc import ProgrammingError
 from sqlalchemy.sql import sqltypes as types
 from sqlalchemy.types import TypeDecorator, TypeEngine
 
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -71,6 +72,11 @@ from datahub.ingestion.source.sql.sql_utils import (
 from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
     SqlAlchemyTableDataReader,
 )
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
@@ -531,6 +537,24 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self.config.include_views:
             yield from self.loop_views(inspector, schema, self.config)
 
+        if getattr(self.config, "include_stored_procedures", False):
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except NotImplementedError as e:
+                self.report.warning(
+                    title="Stored procedures not supported",
+                    message="The current SQL dialect does not support stored procedures.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
@@ -1437,3 +1461,116 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
 
     def get_report(self):
         return self.report
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: Union[SQLCommonConfig, Type[SQLCommonConfig]],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop schema data for get stored procedures as dataJob-s.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                procedure_pattern = getattr(
+                    self.config, "procedure_pattern", AllowDenyPattern.allow_all()
+                )
+                if not procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except NotImplementedError:
+            raise
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        raise NotImplementedError(
+            "Subclasses must implement the 'get_procedures_for_schema' method."
+        )
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )
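Taken together, the sql_common.py additions above move the stored-procedure loop into SQLAlchemySource and leave get_procedures_for_schema as the single hook a dialect-specific source must override. A hedged sketch of such an override; the subclass name is hypothetical and the body deliberately returns an empty list because BaseProcedure's fields are not shown in this diff:

    from typing import List

    from datahub.ingestion.source.sql.sql_common import SQLAlchemySource
    from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure


    class ExampleDialectSource(SQLAlchemySource):
        """Hypothetical source used only to illustrate the new hook."""

        def get_procedures_for_schema(
            self, inspector, schema: str, db_name: str
        ) -> List[BaseProcedure]:
            # A real implementation would query the dialect's catalog here
            # (compare the pg_catalog query in postgres.py above) and turn each
            # row into a BaseProcedure. Returning [] simply yields no procedure
            # work units for the schema; not overriding at all raises
            # NotImplementedError, which the base class reports as a warning.
            return []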
datahub/ingestion/source/superset.py

@@ -8,6 +8,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import dateutil.parser as dp
 import requests
+import sqlglot
 from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
@@ -75,6 +76,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -131,8 +133,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }
 
 
@@ -633,74 +638,130 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return input_fields
 
-    def construct_chart_cll(
-        self,
-        chart_data: dict,
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []
 
-
-
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)
 
-
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)
 
-
-
-
-
-        dataset_metric_info = dataset_info.get("metrics", [])
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []
 
-
-
-
-
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""
 
-
-
-
-
-
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])
 
-
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
 
-
-
-
-            metric_description = metric.get("description", "")
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
 
-
-
-
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
 
-
-
-
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
             )
             return []
 
+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
-
-
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue
 
             # find matching upstream column
@@ -711,13 +772,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                 if dataset_col_name == chart_col_name:
                     chart_columns.append(
                         (chart_col_name, dataset_col_type, dataset_col_description)
-                    )
+                    )
                     break
-
-            # if no matching upstream column was found
-            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                 chart_columns.append((chart_col_name, "", ""))
 
+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
         return self.build_input_fields(chart_columns, datasource_urn)
 
     def construct_chart_from_chart_data(
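The Superset changes above add _extract_columns_from_sql, which uses sqlglot to recover the column names referenced by a SQL metric or column expression before matching them against the upstream dataset. A small standalone sketch of the same idea, shown here with sqlglot's find_all helper instead of the walk loop used in the source:

    from typing import List

    import sqlglot
    from sqlglot import exp

    def referenced_columns(sql_expr: str) -> List[str]:
        # Parse the expression and collect every column reference it mentions.
        parsed = sqlglot.parse_one(sql_expr)
        return sorted({col.name for col in parsed.find_all(exp.Column)})

    print(referenced_columns("SUM(amount) / NULLIF(COUNT(order_id), 0)"))
    # -> ['amount', 'order_id']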
datahub/ingestion/source/unity/config.py

@@ -275,6 +275,17 @@ class UnityCatalogSourceConfig(
         hidden_from_docs=True,
     )
 
+    databricks_api_page_size: int = pydantic.Field(
+        default=0,
+        ge=0,
+        description=(
+            "Page size for Databricks API calls when listing resources (catalogs, schemas, tables, etc.). "
+            "When set to 0 (default), uses server-side configured page length (recommended). "
+            "When set to a positive value, the page length is the minimum of this value and the server configured value. "
+            "Must be a non-negative integer."
+        ),
+    )
+
     include_usage_statistics: bool = Field(
         default=True,
         description="Generate usage statistics.",
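The new databricks_api_page_size option above lets the Unity Catalog source cap the page length used for Databricks listing calls, with 0 deferring to the server-side default. A hedged recipe fragment in dict form; the workspace_url and token values are placeholders, and the surrounding keys follow the existing Unity Catalog source config rather than this diff:

    # Source config fragment as it would appear in an ingestion recipe.
    unity_source_config = {
        "workspace_url": "https://example.cloud.databricks.com",  # placeholder
        "token": "<personal-access-token>",  # placeholder
        "databricks_api_page_size": 500,  # 0 (default) keeps the server-configured page length
    }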
datahub/ingestion/source/unity/platform_resource_repository.py (new file)

@@ -0,0 +1,19 @@
+import logging
+
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.ingestion.source.unity.tag_entities import (
+    UnityCatalogTagPlatformResource,
+    UnityCatalogTagPlatformResourceId,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogPlatformResourceRepository(
+    PlatformResourceRepository[
+        UnityCatalogTagPlatformResourceId, UnityCatalogTagPlatformResource
+    ]
+):
+    """Unity Catalog-specific platform resource repository with tag-related operations."""
"""Unity Catalog-specific platform resource repository with tag-related operations."""
|