acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc5__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (52)
  1. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/METADATA +2575 -2575
  2. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/RECORD +52 -45
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +21 -4
  5. datahub/ingestion/api/decorators.py +14 -3
  6. datahub/ingestion/api/report.py +123 -2
  7. datahub/ingestion/api/source.py +45 -44
  8. datahub/ingestion/autogenerated/lineage_helper.py +193 -0
  9. datahub/ingestion/graph/client.py +71 -28
  10. datahub/ingestion/run/pipeline.py +6 -0
  11. datahub/ingestion/source/aws/glue.py +1 -1
  12. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  13. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  14. datahub/ingestion/source/bigquery_v2/queries.py +4 -4
  15. datahub/ingestion/source/common/subtypes.py +43 -0
  16. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  17. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  18. datahub/ingestion/source/hex/api.py +26 -1
  19. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  20. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
  21. datahub/ingestion/source/salesforce.py +6 -3
  22. datahub/ingestion/source/slack/slack.py +2 -1
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
  24. datahub/ingestion/source/sql/athena.py +15 -3
  25. datahub/ingestion/source/sql/mssql/source.py +9 -0
  26. datahub/ingestion/source/sql/sql_common.py +3 -0
  27. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  28. datahub/ingestion/source/sql/teradata.py +4 -1
  29. datahub/ingestion/source/sql/vertica.py +9 -1
  30. datahub/ingestion/source/tableau/tableau.py +6 -1
  31. datahub/ingestion/source/unity/source.py +36 -20
  32. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  33. datahub/metadata/_internal_schema_classes.py +601 -0
  34. datahub/metadata/_urns/urn_defs.py +112 -0
  35. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  36. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  37. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  38. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  39. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  40. datahub/metadata/schema.avsc +383 -0
  41. datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
  42. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  43. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
  44. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  45. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  46. datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
  47. datahub/sdk/datajob.py +39 -15
  48. datahub/specific/dataproduct.py +4 -0
  49. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/licenses/LICENSE +0 -0
  52. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_queries.py

@@ -663,6 +663,7 @@ class SnowflakeQueriesSource(Source):
     def close(self) -> None:
         self.connection.close()
         self.queries_extractor.close()
+        super().close()
 
 
 # Make sure we don't try to generate too much info for a single query.
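Note on the change above: the added `super().close()` lets the base `Source` class run its own teardown after the Snowflake connection and extractor are closed. A minimal sketch of why chaining matters, using hypothetical stand-in classes rather than DataHub's actual `Source` API:

    # Hypothetical stand-ins; DataHub's real Source base class differs.
    class BaseSource:
        def close(self) -> None:
            # Base teardown: flush reports, release shared state, etc.
            print("base cleanup")

    class QueriesSource(BaseSource):
        def close(self) -> None:
            print("closing connection")
            print("closing extractor")
            super().close()  # without this, base cleanup silently never runs

    QueriesSource().close()
    # closing connection
    # closing extractor
    # base cleanup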

datahub/ingestion/source/sql/athena.py

@@ -29,7 +29,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
@@ -321,9 +324,18 @@ class Partitionitem:
 @capability(
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
-@capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
-@capability(SourceCapability.LINEAGE_FINE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """

datahub/ingestion/source/sql/mssql/source.py

@@ -27,6 +27,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
     MSSQLDataFlow,
@@ -177,10 +178,18 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
 )
 @capability(
     SourceCapability.LINEAGE_FINE,
     "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
 )
 class SQLServerSource(SQLAlchemySource):
     """

datahub/ingestion/source/sql/sql_common.py

@@ -54,6 +54,7 @@ from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
@@ -305,10 +306,12 @@ class ProfileMetadata:
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default to get lineage for views via `include_view_lineage`",
+    subtype_modifier=[SourceCapabilityModifier.VIEW],
 )
 @capability(
     SourceCapability.LINEAGE_FINE,
     "Enabled by default to get lineage for views via `include_view_column_lineage`",
+    subtype_modifier=[SourceCapabilityModifier.VIEW],
 )
 @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 @capability(

datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -57,10 +57,11 @@ class GenericProfiler:
         platform: Optional[str] = None,
         profiler_args: Optional[Dict] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        # We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
         ge_profile_requests: List[GEProfilerRequest] = [
             cast(GEProfilerRequest, request)
             for request in requests
-            if not request.profile_table_level_only
+            if not request.profile_table_level_only or request.table.rows_count == 0
         ]
         table_level_profile_requests: List[TableProfilerRequest] = [
             request for request in requests if request.profile_table_level_only
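The new predicate above routes zero-row tables to the GE profiler even when only table-level profiling was requested. A self-contained sketch of the resulting partition; `TableInfo` and `Request` are stand-ins for the internal request types:

    from dataclasses import dataclass

    @dataclass
    class TableInfo:
        rows_count: int

    @dataclass
    class Request:
        table: TableInfo
        profile_table_level_only: bool

    requests = [
        Request(TableInfo(rows_count=100), profile_table_level_only=False),
        Request(TableInfo(rows_count=100), profile_table_level_only=True),
        Request(TableInfo(rows_count=0), profile_table_level_only=True),
    ]

    # New condition: zero-row, table-level-only requests also reach GE.
    ge = [r for r in requests if not r.profile_table_level_only or r.table.rows_count == 0]
    table_level = [r for r in requests if r.profile_table_level_only]

    print(len(ge), len(table_level))  # 2 2

Note that the zero-row request now lands in both lists, mirroring the diff, where only the GE-side predicate changed.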

datahub/ingestion/source/sql/teradata.py

@@ -445,7 +445,10 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.CONTAINERS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-@capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration")
+@capability(
+    SourceCapability.DELETION_DETECTION,
+    "Enabled by default when stateful ingestion is turned on",
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 @capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration")

datahub/ingestion/source/sql/vertica.py

@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -113,6 +117,10 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
@@ -497,7 +505,7 @@ class VerticaSource(SQLAlchemySource):
                 changeType=ChangeTypeClass.UPSERT,
                 entityUrn=dataset_urn,
                 aspectName="subTypes",
-                aspect=SubTypesClass(typeNames=["Projections"]),
+                aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
             ).as_workunit()
 
         if self.config.domain:
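Replacing the "Projections" string literal with `DatasetSubTypes.PROJECTIONS` keeps subtype names consistent across sources. A sketch of the idea, assuming a str-backed enum (the stand-in below is not the real `DatasetSubTypes` definition):

    from enum import Enum

    class DatasetSubTypes(str, Enum):  # hypothetical stand-in
        PROJECTIONS = "Projections"

    # Serializes like the old literal, but a typo now fails at import time
    # instead of silently emitting a different subtype.
    print(DatasetSubTypes.PROJECTIONS == "Projections")  # True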

datahub/ingestion/source/tableau/tableau.py

@@ -80,6 +80,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -867,10 +868,14 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
 @capability(
     SourceCapability.USAGE_STATS,
     "Dashboard/Chart view counts, enabled using extract_usage_stats config",
+    subtype_modifier=[
+        SourceCapabilityModifier.DASHBOARD,
+        SourceCapabilityModifier.CHART,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
 @capability(SourceCapability.TAGS, "Requires recipe configuration")

datahub/ingestion/source/unity/source.py

@@ -1020,29 +1020,45 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     ) -> Iterable[MetadataWorkUnit]:
         if self.ctx.graph and self.platform_resource_repository:
             for tag in tags:
-                platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
-                    platform_instance=self.platform_instance_name,
-                    platform_resource_repository=self.platform_resource_repository,
-                    tag=tag,
-                )
-                logger.debug(f"Created platform resource {platform_resource_id}")
+                try:
+                    platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
+                        platform_instance=self.platform_instance_name,
+                        platform_resource_repository=self.platform_resource_repository,
+                        tag=tag,
+                    )
+                    logger.debug(f"Created platform resource {platform_resource_id}")
 
-                unity_catalog_tag = UnityCatalogTagPlatformResource.get_from_datahub(
-                    platform_resource_id, self.platform_resource_repository, False
-                )
-                if (
-                    tag.to_datahub_tag_urn().urn()
-                    not in unity_catalog_tag.datahub_linked_resources().urns
-                ):
-                    unity_catalog_tag.datahub_linked_resources().add(
-                        tag.to_datahub_tag_urn().urn()
+                    unity_catalog_tag = (
+                        UnityCatalogTagPlatformResource.get_from_datahub(
+                            platform_resource_id,
+                            self.platform_resource_repository,
+                            False,
+                        )
                     )
-                platform_resource = unity_catalog_tag.as_platform_resource()
-                for mcp in platform_resource.to_mcps():
-                    yield MetadataWorkUnit(
-                        id=f"platform_resource-{platform_resource.id}",
-                        mcp=mcp,
+                    if (
+                        tag.to_datahub_tag_urn().urn()
+                        not in unity_catalog_tag.datahub_linked_resources().urns
+                    ):
+                        unity_catalog_tag.datahub_linked_resources().add(
+                            tag.to_datahub_tag_urn().urn()
                         )
+                    platform_resource = unity_catalog_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.exception(
+                        f"Error processing platform resource for tag {tag}"
+                    )
+                    self.report.report_warning(
+                        message="Error processing platform resource for tag",
+                        context=str(tag),
+                        title="Error processing platform resource for tag",
+                        exc=e,
+                    )
+                    continue
 
     def _create_schema_metadata_aspect(
         self, table: Table
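The Unity Catalog change above wraps each tag's platform-resource handling in try/except, so one failing tag is logged and reported as a warning instead of aborting the whole ingestion generator. The pattern in isolation, with hypothetical names:

    import logging
    from typing import Iterable

    logger = logging.getLogger(__name__)

    def process_tags(tags: Iterable[str]) -> Iterable[str]:
        for tag in tags:
            try:
                if tag == "bad":  # stand-in for a failing from_tag() call
                    raise ValueError("cannot build platform resource")
                yield f"workunit-{tag}"
            except Exception:
                logger.exception("Error processing platform resource for tag %s", tag)
                continue  # skip this tag, keep yielding the rest

    print(list(process_tags(["a", "bad", "b"])))  # ['workunit-a', 'workunit-b']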

datahub/ingestion/transformer/add_dataset_ownership.py

@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
 
         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {owner.owner: owner for owner in server_ownership.owners}
-            owners.update({owner.owner: owner for owner in mce_ownership.owners})
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())
 
         return mce_ownership
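The transformer change above merges server-side and MCE owners keyed on the (owner, type, typeUrn) triple instead of the owner urn alone, so one urn can hold several ownership types at once. A self-contained sketch; `Owner` stands in for the generated `OwnerClass`:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass(frozen=True)
    class Owner:  # stand-in for DataHub's generated OwnerClass
        owner: str
        type: str
        typeUrn: Optional[str] = None

    server = [Owner("urn:li:corpuser:jdoe", "TECHNICAL_OWNER")]
    mce = [Owner("urn:li:corpuser:jdoe", "BUSINESS_OWNER")]

    # Old behavior: keyed on the urn only, the MCE owner overwrites the
    # server-side entry that carries a different ownership type.
    old = {o.owner: o for o in server}
    old.update({o.owner: o for o in mce})
    print(len(old))  # 1

    # New behavior: the composite key preserves both ownership types.
    new = {(o.owner, o.type, o.typeUrn): o for o in server}
    new.update({(o.owner, o.type, o.typeUrn): o for o in mce})
    print(len(new))  # 2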