acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -30,7 +30,13 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     ReferencedTable,
 )
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
-from datahub.
+from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
+from datahub.sql_parsing.sqlglot_lineage import (
+    ColumnLineageInfo,
+    ColumnRef,
+    DownstreamColumnRef,
+    SqlParsingResult,
+)

 logger = logging.getLogger(__name__)

@@ -262,6 +268,33 @@ class AbstractLineage(ABC):
             ),
         )

+    def create_table_column_lineage(self, urn: str) -> List[ColumnLineageInfo]:
+        column_lineage = []
+
+        if self.table.columns is not None:
+            for column in self.table.columns:
+                downstream = DownstreamColumnRef(
+                    table=self.table.name,
+                    column=column.name,
+                    column_type=SchemaFieldDataTypeClass(type=column.datahubDataType),
+                    native_column_type=column.dataType or "UNKNOWN",
+                )
+
+                upstreams = [
+                    ColumnRef(
+                        table=urn,
+                        column=column.name.lower(),
+                    )
+                ]
+
+                column_lineage_info = ColumnLineageInfo(
+                    downstream=downstream, upstreams=upstreams
+                )
+
+                column_lineage.append(column_lineage_info)
+
+        return column_lineage
+

 class AmazonRedshiftLineage(AbstractLineage):
     def get_platform_pair(self) -> DataPlatformPair:
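For orientation only, a minimal self-contained sketch (not part of the diff) of the objects the new create_table_column_lineage helper assembles for one column, using the classes imported in the hunk above; the URN, table, and column names are invented:

from datahub.metadata.schema_classes import SchemaFieldDataTypeClass, StringTypeClass
from datahub.sql_parsing.sqlglot_lineage import (
    ColumnLineageInfo,
    ColumnRef,
    DownstreamColumnRef,
)

# Invented upstream dataset URN; in the handler this comes from make_urn().
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,sales.public.orders,PROD)"

lineage_entry = ColumnLineageInfo(
    downstream=DownstreamColumnRef(
        table="orders",  # PowerBI table name
        column="OrderId",  # PowerBI column name
        column_type=SchemaFieldDataTypeClass(type=StringTypeClass()),
        native_column_type="Int64",  # the helper falls back to "UNKNOWN" when missing
    ),
    # The upstream side pairs the dataset URN with the lowercased column name,
    # mirroring column=column.name.lower() in the hunk above.
    upstreams=[ColumnRef(table=upstream_urn, column="orderid")],
)
print(lineage_entry)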
@@ -299,6 +332,8 @@ class AmazonRedshiftLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -306,7 +341,7 @@ class AmazonRedshiftLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

@@ -364,6 +399,8 @@ class OracleLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -371,7 +408,7 @@ class OracleLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )

@@ -449,6 +486,8 @@ class DatabricksLineage(AbstractLineage):
                 qualified_table_name=qualified_table_name,
             )

+            column_lineage = self.create_table_column_lineage(urn)
+
             return Lineage(
                 upstreams=[
                     DataPlatformTable(
@@ -456,7 +495,7 @@ class DatabricksLineage(AbstractLineage):
                         urn=urn,
                     )
                 ],
-                column_lineage=
+                column_lineage=column_lineage,
             )

         return Lineage.empty()
@@ -509,6 +548,9 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
             server=server,
             qualified_table_name=qualified_table_name,
         )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -516,10 +558,62 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
         )


+class MySQLLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or db_name is None:
+            return Lineage.empty()  # Return an empty list
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Item"]
+
+        qualified_table_name: str = f"{schema_name}.{table_name}"
+
+        logger.debug(
+            f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.MYSQL.value
+
+
 class PostgresLineage(TwoStepDataAccessPattern):
     def create_lineage(
         self, data_access_func_detail: DataAccessFunctionDetail
@@ -671,6 +765,8 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
             qualified_table_name=qualified_table_name,
         )

+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -678,7 +774,7 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=
+            column_lineage=column_lineage,
        )

@@ -726,6 +822,7 @@ class NativeQueryLineage(AbstractLineage):

         tables: List[str] = native_sql_parser.get_tables(query)

+        column_lineage = []
         for qualified_table_name in tables:
             if len(qualified_table_name.split(".")) != 3:
                 logger.debug(
@@ -748,12 +845,11 @@ class NativeQueryLineage(AbstractLineage):
                 )
             )

+            column_lineage = self.create_table_column_lineage(urn)
+
         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")

-        return Lineage(
-            upstreams=dataplatform_tables,
-            column_lineage=[],
-        )
+        return Lineage(upstreams=dataplatform_tables, column_lineage=column_lineage)

     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
         if (
@@ -885,6 +981,11 @@ class SupportedPattern(Enum):
         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
     )

+    MYSQL = (
+        MySQLLineage,
+        FunctionName.MYSQL_DATA_ACCESS,
+    )
+
     NATIVE_QUERY = (
         NativeQueryLineage,
         FunctionName.NATIVE_QUERY,
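The new MYSQL member follows the existing SupportedPattern convention of pairing a handler class with the M-Query data-access function it recognizes. A stripped-down, self-contained sketch of that dispatch pattern (class and function names here are illustrative stand-ins, not the module's real ones):

from enum import Enum


class MySqlHandler:  # stands in for MySQLLineage
    pass


class NativeQueryHandler:  # stands in for NativeQueryLineage
    pass


class Pattern(Enum):
    # Each member pairs a handler class with the M-Query function name it serves.
    MYSQL = (MySqlHandler, "MySQL.Database")
    NATIVE_QUERY = (NativeQueryHandler, "Value.NativeQuery")

    def handler(self):
        return self.value[0]

    def function_name(self) -> str:
        return self.value[1]

    @staticmethod
    def get_pattern_handler(function_name: str):
        # Mirrors the idea behind SupportedPattern.get_pattern_handler:
        # return the member whose function name matches, or None when unsupported.
        for member in Pattern:
            if member.function_name() == function_name:
                return member
        return None


assert Pattern.get_pattern_handler("MySQL.Database") is Pattern.MYSQL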
datahub/ingestion/source/powerbi/m_query/resolver.py

@@ -361,6 +361,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         )

         if output_variable is None:
+            logger.debug(
+                f"Table: {self.table.full_name}: output-variable not found in tree"
+            )
             self.reporter.report_warning(
                 f"{self.table.full_name}-output-variable",
                 "output-variable not found in table expression",
@@ -374,6 +377,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):

         # Each item is data-access function
         for f_detail in table_links:
+            logger.debug(
+                f"Processing data-access-function {f_detail.data_access_function_name}"
+            )
             # Get & Check if we support data-access-function available in M-Query
             supported_resolver = SupportedPattern.get_pattern_handler(
                 f_detail.data_access_function_name
@@ -390,6 +396,10 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):

             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
+            logger.debug(
+                f"Creating instance of {supported_resolver.handler().__name__} "
+                f"for data-access-function {f_detail.data_access_function_name}"
+            )
             pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
datahub/ingestion/source/powerbi/powerbi.py

@@ -3,6 +3,7 @@
 # Meta Data Ingestion From the Power BI Source
 #
 #########################################################
+import functools
 import logging
 from datetime import datetime
 from typing import Iterable, List, Optional, Tuple, Union
@@ -24,6 +25,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import (
+    auto_incremental_lineage,
     convert_dashboard_info_to_patch,
 )
 from datahub.ingestion.api.source import (
@@ -238,6 +240,10 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []

+        logger.debug(
+            f"Extracting lineage for table {table.full_name} in dataset {table.dataset.name if table.dataset else None}"
+        )
+
         upstream_lineage: List[
             datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
         ] = parser.get_upstream_tables(
@@ -666,6 +672,7 @@ class Mapper:
         workspace: powerbi_data_classes.Workspace,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi dashboard to Datahub dashboard
@@ -695,6 +702,7 @@ class Mapper:
             lastModified=ChangeAuditStamps(),
             dashboardUrl=dashboard.webUrl,
             customProperties={**chart_custom_properties(dashboard)},
+            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -933,7 +941,7 @@ class Mapper:
         dashboard: powerbi_data_classes.Dashboard,
         workspace: powerbi_data_classes.Workspace,
     ) -> List[EquableMetadataWorkUnit]:
-        mcps = []
+        mcps: List[MetadataChangeProposalWrapper] = []

         logger.info(
             f"Converting dashboard={dashboard.displayName} to datahub dashboard"
@@ -945,9 +953,30 @@ class Mapper:
         )
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
+
+        # collect all downstream reports (dashboards)
+        dashboard_edges = []
+        for t in dashboard.tiles:
+            if t.report:
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=t.report.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                )
+                dashboard_edges.append(edge)
+
         # Lets convert dashboard to datahub dashboard
         dashboard_mcps: List[MetadataChangeProposalWrapper] = (
-            self.to_datahub_dashboard_mcp(
+            self.to_datahub_dashboard_mcp(
+                dashboard=dashboard,
+                workspace=workspace,
+                chart_mcps=chart_mcps,
+                user_mcps=user_mcps,
+                dashboard_edges=dashboard_edges,
+            )
         )

         # Now add MCPs in sequence
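For orientation, a small sketch (not part of the diff) of the Edge objects collected in the loop above; the platform values and report name are invented:

from datahub.emitter import mce_builder as builder
from datahub.metadata.schema_classes import EdgeClass

# Invented report identifier; the mapper derives it from the tile's report.
report_urn = builder.make_dashboard_urn(
    platform="powerbi",
    platform_instance=None,
    name="reports.5b218778-e7a5-4d73-8187-f10824047715",
)

# Only destinationUrn is set, as in the hunk above; the other Edge fields
# (sourceUrn, created, lastModified, properties) keep their None defaults.
edge = EdgeClass(destinationUrn=report_urn)
print(edge.destinationUrn)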
@@ -1054,7 +1083,6 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
-        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1076,7 +1104,6 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
-            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -1170,27 +1197,12 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

-        # find all dashboards with a Tile referencing this report
-        downstream_dashboards_edges = []
-        for d in workspace.dashboards.values():
-            if any(t.report_id == report.id for t in d.tiles):
-                dashboard_urn = builder.make_dashboard_urn(
-                    platform=self.__config.platform_name,
-                    platform_instance=self.__config.platform_instance,
-                    name=d.get_urn_part(),
-                )
-                edge = EdgeClass(
-                    destinationUrn=dashboard_urn,
-                    sourceUrn=None,
-                    created=None,
-                    lastModified=None,
-                    properties=None,
-                )
-                downstream_dashboards_edges.append(edge)
-
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
-            workspace,
+            workspace=workspace,
+            report=report,
+            chart_mcps=chart_mcps,
+            user_mcps=user_mcps,
         )

         # Now add MCPs in sequence
@@ -1300,7 +1312,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):

         allowed_workspaces = []
         for workspace in all_workspaces:
-            if not self.source_config.workspace_id_pattern.allowed(
+            if not self.source_config.workspace_id_pattern.allowed(
+                workspace.id
+            ) or not self.source_config.workspace_name_pattern.allowed(workspace.name):
                 self.reporter.filtered_workspace_names.append(
                     f"{workspace.id} - {workspace.name}"
                 )
@@ -1516,6 +1530,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
         else:
             return [
                 *super().get_workunit_processors(),
+                functools.partial(
+                    auto_incremental_lineage, self.source_config.incremental_lineage
+                ),
                 self.stale_entity_removal_handler.workunit_processor,
             ]
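The processor registered above follows the usual workunit-processor shape: functools.partial pre-binds the incremental_lineage flag so the pipeline can later call the processor with only the workunit stream. A stripped-down, self-contained sketch of that pattern (all names are illustrative stand-ins, not the real helper):

import functools
from typing import Callable, Iterable

WorkUnit = str  # stand-in for MetadataWorkUnit


def auto_incremental_lineage_example(
    enabled: bool, stream: Iterable[WorkUnit]
) -> Iterable[WorkUnit]:
    # Stand-in for auto_incremental_lineage: rewrite lineage into patches when
    # enabled, otherwise pass workunits through untouched.
    for wu in stream:
        yield f"patched:{wu}" if enabled else wu


# Pre-bind the config flag; the pipeline later calls processor(stream).
processor: Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]] = functools.partial(
    auto_incremental_lineage_example, True
)

print(list(processor(["wu1", "wu2"])))  # ['patched:wu1', 'patched:wu2']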
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py

@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results

-        reports: List[dict] = scan_result.get(Constant.REPORTS
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []

         for report in reports:
-            report_id = report.get(Constant.ID
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements

@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []

-        endorsement = endorsements.get(Constant.ENDORSEMENT
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []

@@ -396,7 +396,7 @@ class PowerBiAPI:

         if self.__config.extract_endorsements_to_tags:
             dataset_instance.tags = self._parse_endorsement(
-                dataset_dict.get(Constant.ENDORSEMENT_DETAIL
+                dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
             )

         dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
             else dataset_instance.id
         )
         logger.debug(f"dataset_dict = {dataset_dict}")
-        for table in dataset_dict.get(Constant.TABLES
+        for table in dataset_dict.get(Constant.TABLES) or []:
             expression: Optional[str] = (
                 table[Constant.SOURCE][0][Constant.EXPRESSION]
                 if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                         column["dataType"], FIELD_TYPE_MAPPING["Null"]
                     ),
                 )
-                for column in table.get("columns"
+                for column in table.get("columns") or []
             ],
             measures=[
-                Measure(**measure) for measure in table.get("measures"
+                Measure(**measure) for measure in table.get("measures") or []
             ],
             dataset=dataset_instance,
             row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
                 )
             )
             if app_id is None:  # In PowerBI one workspace can have one app
-                app_id = report
+                app_id = report[Constant.APP_ID]

         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
             if dashboard.get(Constant.APP_ID):
                 raw_app_dashboards.append(dashboard)
             if app_id is None:  # In PowerBI, one workspace contains one app
-                app_id =
+                app_id = dashboard[Constant.APP_ID]

         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
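The repeated ".get(...) or []" changes above make iteration safe when a key is missing or maps to an explicit null in the scan-result payload; a plain .get(key, []) default only covers the missing-key case. A quick self-contained illustration with an invented payload:

scan_result = {"dashboards": None, "reports": [{"id": "r1"}]}

# A .get default only applies when the key is absent, not when its value is None.
assert scan_result.get("dashboards", []) is None

# "or []" also covers the explicit-null case, so the loop body is simply skipped.
for dashboard in scan_result.get("dashboards") or []:
    print(dashboard)  # never reached for a null entry

for report in scan_result.get("reports") or []:
    print(report["id"])  # r1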
datahub/ingestion/source/redshift/lineage_v2.py

@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )

         # Populate lineage for external tables.
-        self.
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)

     def _populate_lineage_agg(
         self,
@@ -400,6 +401,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +412,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                     and schema.external_platform
                 ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                     # external_db_params = schema.option
                     upstream_platform = schema.external_platform.lower()
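The new guard above checks db_schemas before the inner loop indexes into it, so a schema that appears in all_tables but was never loaded into db_schemas is skipped with a warning instead of raising KeyError. A minimal illustration with invented data:

all_tables = {"dev": {"external_schema": ["events"], "missing_schema": ["t1"]}}
db_schemas = {"dev": {"external_schema": {"platform": "glue"}}}

database = "dev"
for schema_name, tables in all_tables[database].items():
    if not db_schemas[database].get(schema_name):
        print(f"Schema {schema_name} not found")  # missing_schema is skipped
        continue
    for table in tables:
        schema = db_schemas[database][schema_name]  # safe: schema is known to exist
        print(schema_name, table, schema["platform"])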
datahub/ingestion/source/redshift/query.py

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas
datahub/ingestion/source/s3/source.py

@@ -945,6 +945,17 @@ class S3Source(StatefulIngestionSourceBase):
                     for f in list_folders(
                         bucket_name, f"{folder}", self.source_config.aws_config
                     ):
+                        table_path = self.create_s3_path(bucket_name, f)
+                        table_name, _ = path_spec.extract_table_name_and_path(
+                            table_path
+                        )
+                        if not path_spec.tables_filter_pattern.allowed(table_name):
+                            logger.debug(
+                                f"Table '{table_name}' not allowed and skipping"
+                            )
+                            self.report.report_file_dropped(table_path)
+                            continue
+
                         dirs_to_process = []
                         logger.info(f"Processing folder: {f}")
                         if path_spec.traversal_method == FolderTraversalMethod.ALL:
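tables_filter_pattern is used here as an allow/deny filter over derived table names (the pattern itself comes from the path_spec changes listed in the summary above). A minimal sketch with invented patterns, assuming the standard AllowDenyPattern helper:

from datahub.configuration.common import AllowDenyPattern

# Illustrative patterns; in the source they come from the path_spec config.
tables_filter_pattern = AllowDenyPattern(allow=["orders.*"], deny=[".*_tmp"])

for table_name in ["orders_2024", "orders_tmp", "customers"]:
    if not tables_filter_pattern.allowed(table_name):
        print(f"Table '{table_name}' not allowed and skipping")  # orders_tmp, customers
    else:
        print(f"Processing table '{table_name}'")  # orders_2024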
datahub/ingestion/source/sigma/config.py

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
@@ -54,16 +55,14 @@ class Constant:

 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-
+    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
+    number_of_workspaces: Optional[int] = None
     non_accessible_workspaces_count: int = 0
     shared_entities_count: int = 0
     number_of_datasets: int = 0
     number_of_workbooks: int = 0
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)

-    def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
-        self.number_of_workspaces = number_of_workspaces
-

 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
     data_source_platform: str = pydantic.Field(

datahub/ingestion/source/sigma/sigma.py

@@ -162,14 +162,17 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
-        allowed_workspaces = [
-            workspace
-            for workspace in all_workspaces
-            if self.config.workspace_pattern.allowed(workspace.name)
-        ]
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.
+        self.reporter.number_of_workspaces = len(all_workspaces)
+
+        allowed_workspaces = []
+        for workspace in all_workspaces:
+            if self.config.workspace_pattern.allowed(workspace.name):
+                allowed_workspaces.append(workspace)
+            else:
+                self.reporter.workspaces.dropped(workspace.workspaceId)
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
+
         return allowed_workspaces

     def _gen_workspace_workunit(
@@ -658,6 +661,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)

         for workspace in self._get_allowed_workspaces():
+            self.reporter.workspaces.processed(workspace.workspaceId)
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
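EntityFilterReport gives the Sigma report a per-entity processed/dropped tally. A small, self-contained sketch of that field in an illustrative report class, assuming it behaves the same way outside a full source report; the workspace IDs are invented:

from dataclasses import dataclass

from datahub.ingestion.api.report import EntityFilterReport


@dataclass
class ExampleReport:
    # Mirrors SigmaSourceReport.workspaces: one filter report per entity type.
    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")


report = ExampleReport()
report.workspaces.processed("workspace-123")  # counted as ingested
report.workspaces.dropped("workspace-456")    # counted as filtered out
print(report)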