acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (106)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  59. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  62. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  63. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  64. datahub/ingestion/source/sql/mssql/source.py +8 -4
  65. datahub/ingestion/source/sql/oracle.py +51 -4
  66. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  67. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  68. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  69. datahub/ingestion/source/superset.py +291 -35
  70. datahub/ingestion/source/usage/usage_common.py +0 -65
  71. datahub/ingestion/source/vertexai/__init__.py +0 -0
  72. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  73. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  74. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  75. datahub/metadata/_schema_classes.py +472 -1
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  80. datahub/metadata/schema.avsc +313 -2
  81. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  82. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  83. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  84. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  85. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  86. datahub/metadata/schemas/Deprecation.avsc +2 -0
  87. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  89. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  90. datahub/metadata/schemas/Siblings.avsc +2 -0
  91. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  92. datahub/sdk/__init__.py +1 -0
  93. datahub/sdk/dataset.py +122 -0
  94. datahub/sdk/entity.py +99 -3
  95. datahub/sdk/entity_client.py +27 -3
  96. datahub/sdk/main_client.py +24 -1
  97. datahub/sdk/search_client.py +81 -8
  98. datahub/sdk/search_filters.py +94 -37
  99. datahub/sql_parsing/split_statements.py +17 -3
  100. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  101. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  102. datahub/testing/mcp_diff.py +1 -18
  103. datahub/utilities/threaded_iterator_executor.py +16 -3
  104. datahub/ingestion/source/vertexai.py +0 -697
  105. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  106. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -30,7 +30,13 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     ReferencedTable,
 )
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
-from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
+from datahub.sql_parsing.sqlglot_lineage import (
+    ColumnLineageInfo,
+    ColumnRef,
+    DownstreamColumnRef,
+    SqlParsingResult,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -262,6 +268,33 @@ class AbstractLineage(ABC):
             ),
         )
 
+    def create_table_column_lineage(self, urn: str) -> List[ColumnLineageInfo]:
+        column_lineage = []
+
+        if self.table.columns is not None:
+            for column in self.table.columns:
+                downstream = DownstreamColumnRef(
+                    table=self.table.name,
+                    column=column.name,
+                    column_type=SchemaFieldDataTypeClass(type=column.datahubDataType),
+                    native_column_type=column.dataType or "UNKNOWN",
+                )
+
+                upstreams = [
+                    ColumnRef(
+                        table=urn,
+                        column=column.name.lower(),
+                    )
+                ]
+
+                column_lineage_info = ColumnLineageInfo(
+                    downstream=downstream, upstreams=upstreams
+                )
+
+                column_lineage.append(column_lineage_info)
+
+        return column_lineage
+
 
 class AmazonRedshiftLineage(AbstractLineage):
     def get_platform_pair(self) -> DataPlatformPair:
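Note: the new create_table_column_lineage helper emits one ColumnLineageInfo per PowerBI table column, pairing a DownstreamColumnRef for the PowerBI column with a single upstream ColumnRef on the source table's URN. A minimal sketch of the objects it produces, using a made-up MySQL URN and column names purely for illustration:

from datahub.metadata.schema_classes import NumberTypeClass, SchemaFieldDataTypeClass
from datahub.sql_parsing.sqlglot_lineage import (
    ColumnLineageInfo,
    ColumnRef,
    DownstreamColumnRef,
)

# Hypothetical upstream table URN and column names, for illustration only.
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,sales_db.orders,PROD)"

lineage_entry = ColumnLineageInfo(
    downstream=DownstreamColumnRef(
        table="orders",  # the PowerBI table name
        column="customer_id",  # the PowerBI column name
        column_type=SchemaFieldDataTypeClass(type=NumberTypeClass()),
        native_column_type="Int64",  # the helper falls back to "UNKNOWN" when unset
    ),
    upstreams=[ColumnRef(table=upstream_urn, column="customer_id")],
)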
@@ -299,6 +332,8 @@ class AmazonRedshiftLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -306,7 +341,7 @@ class AmazonRedshiftLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
@@ -364,6 +399,8 @@ class OracleLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -371,7 +408,7 @@ class OracleLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
@@ -449,6 +486,8 @@ class DatabricksLineage(AbstractLineage):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -456,7 +495,7 @@ class DatabricksLineage(AbstractLineage):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
        )
 
         return Lineage.empty()
@@ -509,6 +548,9 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
             server=server,
             qualified_table_name=qualified_table_name,
         )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -516,10 +558,62 @@ class TwoStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
+class MySQLLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}"
+        )
+
+        server, db_name = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+        if server is None or db_name is None:
+            return Lineage.empty()  # Return an empty list
+
+        schema_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Schema"]
+
+        table_name: str = cast(
+            IdentifierAccessor, data_access_func_detail.identifier_accessor
+        ).items["Item"]
+
+        qualified_table_name: str = f"{schema_name}.{table_name}"
+
+        logger.debug(
+            f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=self.get_platform_pair(),
+            server=server,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=self.get_platform_pair(),
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.MYSQL.value
+
+
 class PostgresLineage(TwoStepDataAccessPattern):
     def create_lineage(
         self, data_access_func_detail: DataAccessFunctionDetail
@@ -671,6 +765,8 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
             qualified_table_name=qualified_table_name,
         )
 
+        column_lineage = self.create_table_column_lineage(urn)
+
         return Lineage(
             upstreams=[
                 DataPlatformTable(
@@ -678,7 +774,7 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
                     urn=urn,
                 )
             ],
-            column_lineage=[],
+            column_lineage=column_lineage,
         )
 
 
@@ -726,6 +822,7 @@ class NativeQueryLineage(AbstractLineage):
 
         tables: List[str] = native_sql_parser.get_tables(query)
 
+        column_lineage = []
         for qualified_table_name in tables:
             if len(qualified_table_name.split(".")) != 3:
                 logger.debug(
@@ -748,12 +845,11 @@ class NativeQueryLineage(AbstractLineage):
                 )
             )
 
+            column_lineage = self.create_table_column_lineage(urn)
+
         logger.debug(f"Generated dataplatform_tables {dataplatform_tables}")
 
-        return Lineage(
-            upstreams=dataplatform_tables,
-            column_lineage=[],
-        )
+        return Lineage(upstreams=dataplatform_tables, column_lineage=column_lineage)
 
     def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]:
         if (
@@ -885,6 +981,11 @@ class SupportedPattern(Enum):
         FunctionName.AMAZON_REDSHIFT_DATA_ACCESS,
     )
 
+    MYSQL = (
+        MySQLLineage,
+        FunctionName.MYSQL_DATA_ACCESS,
+    )
+
     NATIVE_QUERY = (
         NativeQueryLineage,
         FunctionName.NATIVE_QUERY,
datahub/ingestion/source/powerbi/m_query/resolver.py

@@ -361,6 +361,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         )
 
         if output_variable is None:
+            logger.debug(
+                f"Table: {self.table.full_name}: output-variable not found in tree"
+            )
             self.reporter.report_warning(
                 f"{self.table.full_name}-output-variable",
                 "output-variable not found in table expression",
@@ -374,6 +377,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
         # Each item is data-access function
         for f_detail in table_links:
+            logger.debug(
+                f"Processing data-access-function {f_detail.data_access_function_name}"
+            )
             # Get & Check if we support data-access-function available in M-Query
             supported_resolver = SupportedPattern.get_pattern_handler(
                 f_detail.data_access_function_name
@@ -390,6 +396,10 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
+            logger.debug(
+                f"Creating instance of {supported_resolver.handler().__name__} "
+                f"for data-access-function {f_detail.data_access_function_name}"
+            )
             pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
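Note: the new MYSQL entry plugs into the same registry pattern used for the other platforms, where each SupportedPattern member pairs a handler class with the M data-access function name and the resolver instantiates the handler for each matching call. The following self-contained toy (generic names, not the DataHub classes; the "MySQL.Database" function name is shown only as an illustration) sketches that dispatch shape:

from enum import Enum
from typing import Optional, Type


class LineageHandler:
    def create_lineage(self, call: str) -> str:
        return f"lineage for {call}"


class MySqlLineageHandler(LineageHandler):
    pass


class PatternRegistry(Enum):
    # (handler class, M data-access function name) pairs, mirroring SupportedPattern
    MYSQL = (MySqlLineageHandler, "MySQL.Database")

    def handler(self) -> Type[LineageHandler]:
        return self.value[0]

    def function_name(self) -> str:
        return self.value[1]

    @classmethod
    def get_pattern_handler(cls, name: str) -> Optional["PatternRegistry"]:
        return next((member for member in cls if member.function_name() == name), None)


resolver = PatternRegistry.get_pattern_handler("MySQL.Database")
if resolver is not None:
    handler = resolver.handler()()  # e.g. MySQLLineage in the real source
    print(handler.create_lineage("MySQL.Database(server, db)"))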
datahub/ingestion/source/powerbi/powerbi.py

@@ -3,6 +3,7 @@
 # Meta Data Ingestion From the Power BI Source
 #
 #########################################################
+import functools
 import logging
 from datetime import datetime
 from typing import Iterable, List, Optional, Tuple, Union
@@ -24,6 +25,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import (
+    auto_incremental_lineage,
     convert_dashboard_info_to_patch,
 )
 from datahub.ingestion.api.source import (
@@ -238,6 +240,10 @@ class Mapper:
         upstream: List[UpstreamClass] = []
         cll_lineage: List[FineGrainedLineage] = []
 
+        logger.debug(
+            f"Extracting lineage for table {table.full_name} in dataset {table.dataset.name if table.dataset else None}"
+        )
+
         upstream_lineage: List[
             datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
         ] = parser.get_upstream_tables(
@@ -666,6 +672,7 @@ class Mapper:
         workspace: powerbi_data_classes.Workspace,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi dashboard to Datahub dashboard
@@ -695,6 +702,7 @@ class Mapper:
             lastModified=ChangeAuditStamps(),
             dashboardUrl=dashboard.webUrl,
             customProperties={**chart_custom_properties(dashboard)},
+            dashboards=dashboard_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -933,7 +941,7 @@ class Mapper:
         dashboard: powerbi_data_classes.Dashboard,
         workspace: powerbi_data_classes.Workspace,
     ) -> List[EquableMetadataWorkUnit]:
-        mcps = []
+        mcps: List[MetadataChangeProposalWrapper] = []
 
         logger.info(
             f"Converting dashboard={dashboard.displayName} to datahub dashboard"
@@ -945,9 +953,30 @@ class Mapper:
         )
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
+
+        # collect all downstream reports (dashboards)
+        dashboard_edges = []
+        for t in dashboard.tiles:
+            if t.report:
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=t.report.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                )
+                dashboard_edges.append(edge)
+
         # Lets convert dashboard to datahub dashboard
         dashboard_mcps: List[MetadataChangeProposalWrapper] = (
-            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+            self.to_datahub_dashboard_mcp(
+                dashboard=dashboard,
+                workspace=workspace,
+                chart_mcps=chart_mcps,
+                user_mcps=user_mcps,
+                dashboard_edges=dashboard_edges,
+            )
         )
 
         # Now add MCPs in sequence
@@ -1054,7 +1083,6 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
-        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1076,7 +1104,6 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
-            dashboards=dashboard_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1170,27 +1197,12 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
-        # find all dashboards with a Tile referencing this report
-        downstream_dashboards_edges = []
-        for d in workspace.dashboards.values():
-            if any(t.report_id == report.id for t in d.tiles):
-                dashboard_urn = builder.make_dashboard_urn(
-                    platform=self.__config.platform_name,
-                    platform_instance=self.__config.platform_instance,
-                    name=d.get_urn_part(),
-                )
-                edge = EdgeClass(
-                    destinationUrn=dashboard_urn,
-                    sourceUrn=None,
-                    created=None,
-                    lastModified=None,
-                    properties=None,
-                )
-                downstream_dashboards_edges.append(edge)
-
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
-            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+            workspace=workspace,
+            report=report,
+            chart_mcps=chart_mcps,
+            user_mcps=user_mcps,
        )
 
         # Now add MCPs in sequence
@@ -1300,7 +1312,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
 
         allowed_workspaces = []
         for workspace in all_workspaces:
-            if not self.source_config.workspace_id_pattern.allowed(workspace.id):
+            if not self.source_config.workspace_id_pattern.allowed(
+                workspace.id
+            ) or not self.source_config.workspace_name_pattern.allowed(workspace.name):
                 self.reporter.filtered_workspace_names.append(
                     f"{workspace.id} - {workspace.name}"
                 )
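Note: workspace filtering now drops a workspace when either its id or its name fails the corresponding pattern. Both checks use DataHub's AllowDenyPattern (regex allow/deny lists); a small sketch of how a name pattern behaves, with made-up workspace names:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical config: keep workspaces whose name starts with "Finance",
# except anything marked as a sandbox.
workspace_name_pattern = AllowDenyPattern(
    allow=[r"^Finance.*"],
    deny=[r".*sandbox.*"],
)

print(workspace_name_pattern.allowed("Finance Reporting"))  # True
print(workspace_name_pattern.allowed("Finance sandbox"))    # False, denied
print(workspace_name_pattern.allowed("Marketing"))          # False, not in the allow list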
@@ -1516,6 +1530,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
         else:
             return [
                 *super().get_workunit_processors(),
+                functools.partial(
+                    auto_incremental_lineage, self.source_config.incremental_lineage
+                ),
                 self.stale_entity_removal_handler.workunit_processor,
             ]
 
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py

@@ -115,7 +115,7 @@ class PowerBiAPI:
         if scan_result is None:
             return results
 
-        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS, []):
+        for scanned_dashboard in scan_result.get(Constant.DASHBOARDS) or []:
             # Iterate through response and create a list of PowerBiAPI.Dashboard
             dashboard_id = scanned_dashboard.get("id")
             tags = self._parse_endorsement(
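Note: the switch from scan_result.get(key, []) to scan_result.get(key) or [] matters when the API returns a key with an explicit null value. dict.get only falls back to its default when the key is missing; the "or []" form also covers a present-but-None value. A quick illustration:

scan_result = {"dashboards": None}  # key present, value null in the API payload

print(scan_result.get("dashboards", []))    # None -> iterating this would raise TypeError
print(scan_result.get("dashboards") or [])  # []   -> safe to iterate
print(scan_result.get("missing", []))       # []   -> both forms agree when the key is absent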
@@ -133,17 +133,17 @@ class PowerBiAPI:
         if scan_result is None:
             return results
 
-        reports: List[dict] = scan_result.get(Constant.REPORTS, [])
+        reports: List[dict] = scan_result.get(Constant.REPORTS) or []
 
         for report in reports:
-            report_id = report.get(Constant.ID, None)
+            report_id = report.get(Constant.ID)
             if report_id is None:
                 logger.warning(
                     f"Report id is none. Skipping endorsement tag for report instance {report}"
                 )
                 continue
             endorsements = self._parse_endorsement(
-                report.get(Constant.ENDORSEMENT_DETAIL, None)
+                report.get(Constant.ENDORSEMENT_DETAIL)
             )
             results[report_id] = endorsements
 
@@ -339,7 +339,7 @@ class PowerBiAPI:
         if not endorsements:
             return []
 
-        endorsement = endorsements.get(Constant.ENDORSEMENT, None)
+        endorsement = endorsements.get(Constant.ENDORSEMENT)
         if not endorsement:
             return []
 
@@ -396,7 +396,7 @@ class PowerBiAPI:
 
         if self.__config.extract_endorsements_to_tags:
             dataset_instance.tags = self._parse_endorsement(
-                dataset_dict.get(Constant.ENDORSEMENT_DETAIL, None)
+                dataset_dict.get(Constant.ENDORSEMENT_DETAIL)
             )
 
         dataset_map[dataset_instance.id] = dataset_instance
@@ -407,7 +407,7 @@ class PowerBiAPI:
                 else dataset_instance.id
             )
             logger.debug(f"dataset_dict = {dataset_dict}")
-            for table in dataset_dict.get(Constant.TABLES, []):
+            for table in dataset_dict.get(Constant.TABLES) or []:
                 expression: Optional[str] = (
                     table[Constant.SOURCE][0][Constant.EXPRESSION]
                     if table.get(Constant.SOURCE) is not None
@@ -430,10 +430,10 @@ class PowerBiAPI:
                             column["dataType"], FIELD_TYPE_MAPPING["Null"]
                         ),
                     )
-                    for column in table.get("columns", [])
+                    for column in table.get("columns") or []
                 ],
                 measures=[
-                    Measure(**measure) for measure in table.get("measures", [])
+                    Measure(**measure) for measure in table.get("measures") or []
                 ],
                 dataset=dataset_instance,
                 row_count=None,
@@ -480,7 +480,7 @@ class PowerBiAPI:
             )
         )
         if app_id is None:  # In PowerBI one workspace can have one app
-            app_id = report.get(Constant.APP_ID)
+            app_id = report[Constant.APP_ID]
 
         raw_app_dashboards: List[Dict] = []
         # Filter app dashboards
@@ -488,7 +488,7 @@ class PowerBiAPI:
            if dashboard.get(Constant.APP_ID):
                raw_app_dashboards.append(dashboard)
                if app_id is None:  # In PowerBI, one workspace contains one app
-                   app_id = report[Constant.APP_ID]
+                   app_id = dashboard[Constant.APP_ID]
 
         # workspace doesn't have an App. Above two loops can be avoided
         # if app_id is available at root level in workspace_metadata
datahub/ingestion/source/redshift/lineage_v2.py

@@ -230,7 +230,8 @@ class RedshiftSqlLineageV2(Closeable):
         )
 
         # Populate lineage for external tables.
-        self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
 
     def _populate_lineage_agg(
         self,
@@ -400,6 +401,10 @@ class RedshiftSqlLineageV2(Closeable):
         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
     ) -> None:
         for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
             for table in tables:
                 schema = db_schemas[self.database][schema_name]
                 if (
@@ -407,6 +412,9 @@ class RedshiftSqlLineageV2(Closeable):
                     and schema.is_external_schema()
                    and schema.external_platform
                ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
                    # external_db_params = schema.option
                    upstream_platform = schema.external_platform.lower()
datahub/ingestion/source/redshift/query.py

@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
         SELECT
             schema_name,
             schema_type,
-            schema_option,
+            cast(null as varchar(1024)) as schema_option,
             cast(null as varchar(256)) as external_platform,
             cast(null as varchar(256)) as external_database
         FROM svv_redshift_schemas
datahub/ingestion/source/s3/source.py

@@ -945,6 +945,17 @@ class S3Source(StatefulIngestionSourceBase):
                 for f in list_folders(
                     bucket_name, f"{folder}", self.source_config.aws_config
                 ):
+                    table_path = self.create_s3_path(bucket_name, f)
+                    table_name, _ = path_spec.extract_table_name_and_path(
+                        table_path
+                    )
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(
+                            f"Table '{table_name}' not allowed and skipping"
+                        )
+                        self.report.report_file_dropped(table_path)
+                        continue
+
                     dirs_to_process = []
                     logger.info(f"Processing folder: {f}")
                     if path_spec.traversal_method == FolderTraversalMethod.ALL:
datahub/ingestion/source/sigma/config.py

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.ingestion.api.report import EntityFilterReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
@@ -54,16 +55,14 @@ class Constant:
 
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    number_of_workspaces: int = 0
+    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
+    number_of_workspaces: Optional[int] = None
     non_accessible_workspaces_count: int = 0
     shared_entities_count: int = 0
     number_of_datasets: int = 0
     number_of_workbooks: int = 0
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
 
-    def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
-        self.number_of_workspaces = number_of_workspaces
-
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
     data_source_platform: str = pydantic.Field(
datahub/ingestion/source/sigma/sigma.py

@@ -162,14 +162,17 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
-        allowed_workspaces = [
-            workspace
-            for workspace in all_workspaces
-            if self.config.workspace_pattern.allowed(workspace.name)
-        ]
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.report_number_of_workspaces(len(all_workspaces))
+        self.reporter.number_of_workspaces = len(all_workspaces)
+
+        allowed_workspaces = []
+        for workspace in all_workspaces:
+            if self.config.workspace_pattern.allowed(workspace.name):
+                allowed_workspaces.append(workspace)
+            else:
+                self.reporter.workspaces.dropped(workspace.workspaceId)
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
+
         return allowed_workspaces
 
     def _gen_workspace_workunit(
@@ -658,6 +661,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
+            self.reporter.workspaces.processed(workspace.workspaceId)
             yield from self._gen_workspace_workunit(workspace)
             yield from self._gen_sigma_dataset_upstream_lineage_workunit()
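Note: the Sigma report now tracks workspaces with EntityFilterReport, which records processed and dropped entities individually rather than a single counter; number_of_workspaces is kept only as an optional field. A minimal sketch of the usage pattern introduced here, assuming EntityFilterReport.field, processed, and dropped behave as they are used in this diff:

from dataclasses import dataclass

from datahub.ingestion.api.report import EntityFilterReport


@dataclass
class MySourceReport:
    # Declares a filter report for entities of type "workspace".
    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")


report = MySourceReport()
report.workspaces.processed("workspace-id-1")  # counted as ingested
report.workspaces.dropped("workspace-id-2")    # counted as filtered out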