acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc5__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (58)
  1. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/METADATA +2460 -2460
  2. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/RECORD +58 -54
  3. datahub/__init__.py +1 -1
  4. datahub/cli/delete_cli.py +3 -3
  5. datahub/cli/migrate.py +2 -2
  6. datahub/emitter/mcp_builder.py +27 -0
  7. datahub/emitter/rest_emitter.py +1 -1
  8. datahub/ingestion/api/source.py +2 -2
  9. datahub/ingestion/graph/client.py +4 -2
  10. datahub/ingestion/source/aws/glue.py +14 -1
  11. datahub/ingestion/source/aws/s3_util.py +24 -1
  12. datahub/ingestion/source/delta_lake/source.py +0 -5
  13. datahub/ingestion/source/demo_data.py +1 -1
  14. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  15. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  16. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  17. datahub/ingestion/source/iceberg/iceberg.py +10 -3
  18. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  19. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  20. datahub/ingestion/source/kafka_connect/kafka_connect.py +1 -6
  21. datahub/ingestion/source/metabase.py +1 -6
  22. datahub/ingestion/source/mlflow.py +0 -5
  23. datahub/ingestion/source/nifi.py +0 -5
  24. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  25. datahub/ingestion/source/redash.py +0 -5
  26. datahub/ingestion/source/redshift/redshift.py +1 -0
  27. datahub/ingestion/source/s3/source.py +10 -14
  28. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +5 -2
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -20
  31. datahub/ingestion/source/snowflake/snowflake_tag.py +14 -4
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +0 -6
  33. datahub/ingestion/source/sql/sql_types.py +1 -1
  34. datahub/ingestion/source/sql/sql_utils.py +5 -0
  35. datahub/ingestion/source/superset.py +1 -6
  36. datahub/ingestion/source/tableau/tableau.py +0 -6
  37. datahub/metadata/_schema_classes.py +316 -43
  38. datahub/metadata/_urns/urn_defs.py +69 -15
  39. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  40. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  42. datahub/metadata/schema.avsc +296 -87
  43. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  44. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  45. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  46. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  47. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  48. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  49. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  50. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  51. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  52. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  53. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  54. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  55. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  56. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/WHEEL +0 -0
  57. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/entry_points.txt +0 -0
  58. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/top_level.txt +0 -0
@@ -4,12 +4,14 @@ from typing import Dict, Iterable, List, Optional, Union
4
4
 
5
5
  from datahub.configuration.pattern_utils import is_schema_allowed
6
6
  from datahub.emitter.mce_builder import (
7
+ get_sys_time,
7
8
  make_data_platform_urn,
8
9
  make_dataset_urn_with_platform_instance,
9
10
  make_schema_field_urn,
10
11
  make_tag_urn,
11
12
  )
12
13
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
14
+ from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
13
15
  from datahub.ingestion.api.source import SourceReport
14
16
  from datahub.ingestion.api.workunit import MetadataWorkUnit
15
17
  from datahub.ingestion.glossary.classification_mixin import (
@@ -72,6 +74,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
72
74
  PROFILING,
73
75
  )
74
76
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
77
+ AuditStamp,
75
78
  GlobalTags,
76
79
  Status,
77
80
  SubTypes,
@@ -98,7 +101,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
98
101
  StringType,
99
102
  TimeType,
100
103
  )
104
+ from datahub.metadata.com.linkedin.pegasus2avro.structured import (
105
+ StructuredPropertyDefinition,
106
+ )
101
107
  from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
108
+ from datahub.metadata.urns import (
109
+ ContainerUrn,
110
+ DatasetUrn,
111
+ DataTypeUrn,
112
+ EntityTypeUrn,
113
+ SchemaFieldUrn,
114
+ StructuredPropertyUrn,
115
+ )
102
116
  from datahub.sql_parsing.sql_parsing_aggregator import (
103
117
  KnownLineageMapping,
104
118
  SqlParsingAggregator,
@@ -673,14 +687,31 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
673
687
  yield from self.gen_dataset_workunits(view, schema_name, db_name)
674
688
 
675
689
  def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
676
- tag_identifier = tag.identifier()
690
+ use_sp = self.config.extract_tags_as_structured_properties
691
+ identifier = (
692
+ self.snowflake_identifier(tag.structured_property_identifier())
693
+ if use_sp
694
+ else tag.tag_identifier()
695
+ )
677
696
 
678
- if self.report.is_tag_processed(tag_identifier):
697
+ if self.report.is_tag_processed(identifier):
679
698
  return
680
699
 
681
- self.report.report_tag_processed(tag_identifier)
682
-
683
- yield from self.gen_tag_workunits(tag)
700
+ self.report.report_tag_processed(identifier)
701
+ if use_sp:
702
+ yield from self.gen_tag_as_structured_property_workunits(tag)
703
+ else:
704
+ yield from self.gen_tag_workunits(tag)
705
+
706
+ def _format_tags_as_structured_properties(
707
+ self, tags: List[SnowflakeTag]
708
+ ) -> Dict[StructuredPropertyUrn, str]:
709
+ return {
710
+ StructuredPropertyUrn(
711
+ self.snowflake_identifier(tag.structured_property_identifier())
712
+ ): tag.value
713
+ for tag in tags
714
+ }
684
715
 
685
716
  def gen_dataset_workunits(
686
717
  self,
@@ -725,6 +756,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
725
756
  env=self.config.env,
726
757
  )
727
758
 
759
+ if self.config.extract_tags_as_structured_properties:
760
+ yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
761
+
728
762
  yield from add_table_to_schema_container(
729
763
  dataset_urn=dataset_urn,
730
764
  parent_container_key=schema_container_key,
@@ -758,16 +792,24 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
758
792
  )
759
793
 
760
794
  if table.tags:
761
- tag_associations = [
762
- TagAssociation(
763
- tag=make_tag_urn(self.snowflake_identifier(tag.identifier()))
795
+ if self.config.extract_tags_as_structured_properties:
796
+ yield from add_structured_properties_to_entity_wu(
797
+ dataset_urn,
798
+ self._format_tags_as_structured_properties(table.tags),
764
799
  )
765
- for tag in table.tags
766
- ]
767
- global_tags = GlobalTags(tag_associations)
768
- yield MetadataChangeProposalWrapper(
769
- entityUrn=dataset_urn, aspect=global_tags
770
- ).as_workunit()
800
+ else:
801
+ tag_associations = [
802
+ TagAssociation(
803
+ tag=make_tag_urn(
804
+ self.snowflake_identifier(tag.tag_identifier())
805
+ )
806
+ )
807
+ for tag in table.tags
808
+ ]
809
+ global_tags = GlobalTags(tag_associations)
810
+ yield MetadataChangeProposalWrapper(
811
+ entityUrn=dataset_urn, aspect=global_tags
812
+ ).as_workunit()
771
813
 
772
814
  if isinstance(table, SnowflakeView) and table.view_definition is not None:
773
815
  view_properties_aspect = ViewProperties(
@@ -840,10 +882,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
840
882
  )
841
883
 
842
884
  def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
843
- tag_urn = make_tag_urn(self.snowflake_identifier(tag.identifier()))
885
+ tag_urn = make_tag_urn(self.snowflake_identifier(tag.tag_identifier()))
844
886
 
845
887
  tag_properties_aspect = TagProperties(
846
- name=tag.display_name(),
888
+ name=tag.tag_display_name(),
847
889
  description=f"Represents the Snowflake tag `{tag._id_prefix_as_str()}` with value `{tag.value}`.",
848
890
  )
849
891
 
@@ -851,6 +893,41 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
851
893
  entityUrn=tag_urn, aspect=tag_properties_aspect
852
894
  ).as_workunit()
853
895
 
896
+ def gen_tag_as_structured_property_workunits(
897
+ self, tag: SnowflakeTag
898
+ ) -> Iterable[MetadataWorkUnit]:
899
+ identifier = self.snowflake_identifier(tag.structured_property_identifier())
900
+ urn = StructuredPropertyUrn(identifier).urn()
901
+ aspect = StructuredPropertyDefinition(
902
+ qualifiedName=identifier,
903
+ displayName=tag.name,
904
+ valueType=DataTypeUrn("datahub.string").urn(),
905
+ entityTypes=[
906
+ EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
907
+ EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
908
+ EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
909
+ ],
910
+ lastModified=AuditStamp(
911
+ time=get_sys_time(), actor="urn:li:corpuser:datahub"
912
+ ),
913
+ )
914
+ yield MetadataChangeProposalWrapper(
915
+ entityUrn=urn,
916
+ aspect=aspect,
917
+ ).as_workunit()
918
+
919
+ def gen_column_tags_as_structured_properties(
920
+ self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
921
+ ) -> Iterable[MetadataWorkUnit]:
922
+ for column_name in table.column_tags:
923
+ schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
924
+ yield from add_structured_properties_to_entity_wu(
925
+ schema_field_urn,
926
+ self._format_tags_as_structured_properties(
927
+ table.column_tags[column_name]
928
+ ),
929
+ )
930
+
854
931
  def gen_schema_metadata(
855
932
  self,
856
933
  table: Union[SnowflakeTable, SnowflakeView],
@@ -892,13 +969,14 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
892
969
  [
893
970
  TagAssociation(
894
971
  make_tag_urn(
895
- self.snowflake_identifier(tag.identifier())
972
+ self.snowflake_identifier(tag.tag_identifier())
896
973
  )
897
974
  )
898
975
  for tag in table.column_tags[col.name]
899
976
  ]
900
977
  )
901
978
  if col.name in table.column_tags
979
+ and not self.config.extract_tags_as_structured_properties
902
980
  else None
903
981
  ),
904
982
  )
@@ -985,8 +1063,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
985
1063
  )
986
1064
  ),
987
1065
  tags=(
988
- [self.snowflake_identifier(tag.identifier()) for tag in database.tags]
1066
+ [
1067
+ self.snowflake_identifier(tag.tag_identifier())
1068
+ for tag in database.tags
1069
+ ]
989
1070
  if database.tags
1071
+ and not self.config.extract_tags_as_structured_properties
1072
+ else None
1073
+ ),
1074
+ structured_properties=(
1075
+ self._format_tags_as_structured_properties(database.tags)
1076
+ if database.tags and self.config.extract_tags_as_structured_properties
990
1077
  else None
991
1078
  ),
992
1079
  )
@@ -1038,8 +1125,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1038
1125
  else None
1039
1126
  ),
1040
1127
  tags=(
1041
- [self.snowflake_identifier(tag.identifier()) for tag in schema.tags]
1042
- if schema.tags
1128
+ [self.snowflake_identifier(tag.tag_identifier()) for tag in schema.tags]
1129
+ if schema.tags and not self.config.extract_tags_as_structured_properties
1130
+ else None
1131
+ ),
1132
+ structured_properties=(
1133
+ self._format_tags_as_structured_properties(schema.tags)
1134
+ if schema.tags and self.config.extract_tags_as_structured_properties
1043
1135
  else None
1044
1136
  ),
1045
1137
  )
@@ -165,10 +165,20 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
165
165
 
166
166
  allowed_tags = []
167
167
  for tag in tags:
168
- tag_identifier = tag.identifier()
169
- self.report.report_entity_scanned(tag_identifier, "tag")
170
- if not self.config.tag_pattern.allowed(tag_identifier):
171
- self.report.report_dropped(tag_identifier)
168
+ identifier = (
169
+ tag._id_prefix_as_str()
170
+ if self.config.extract_tags_as_structured_properties
171
+ else tag.tag_identifier()
172
+ )
173
+ self.report.report_entity_scanned(identifier, "tag")
174
+
175
+ pattern = (
176
+ self.config.structured_property_pattern
177
+ if self.config.extract_tags_as_structured_properties
178
+ else self.config.tag_pattern
179
+ )
180
+ if not pattern.allowed(identifier):
181
+ self.report.report_dropped(identifier)
172
182
  else:
173
183
  allowed_tags.append(tag)
174
184
  return allowed_tags
@@ -23,7 +23,6 @@ from datahub.ingestion.api.incremental_properties_helper import (
23
23
  from datahub.ingestion.api.source import (
24
24
  CapabilityReport,
25
25
  MetadataWorkUnitProcessor,
26
- Source,
27
26
  SourceCapability,
28
27
  SourceReport,
29
28
  TestableSource,
@@ -251,11 +250,6 @@ class SnowflakeV2Source(
251
250
 
252
251
  self.add_config_to_report()
253
252
 
254
- @classmethod
255
- def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
256
- config = SnowflakeV2Config.parse_obj(config_dict)
257
- return cls(ctx, config)
258
-
259
253
  @staticmethod
260
254
  def test_connection(config_dict: dict) -> TestConnectionReport:
261
255
  test_report = TestConnectionReport()
@@ -93,7 +93,7 @@ POSTGRES_TYPES_MAP: Dict[str, Any] = {
93
93
  "regtype": None,
94
94
  "regrole": None,
95
95
  "regnamespace": None,
96
- "super": None,
96
+ "super": NullType,
97
97
  "uuid": StringType,
98
98
  "pg_lsn": None,
99
99
  "tsvector": None, # text search vector
@@ -20,6 +20,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
20
20
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
21
21
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
22
22
  from datahub.metadata.schema_classes import DataPlatformInstanceClass
23
+ from datahub.metadata.urns import StructuredPropertyUrn
23
24
  from datahub.utilities.registries.domain_registry import DomainRegistry
24
25
  from datahub.utilities.urns.dataset_urn import DatasetUrn
25
26
 
@@ -75,6 +76,7 @@ def gen_schema_container(
75
76
  created: Optional[int] = None,
76
77
  last_modified: Optional[int] = None,
77
78
  extra_properties: Optional[Dict[str, str]] = None,
79
+ structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
78
80
  ) -> Iterable[MetadataWorkUnit]:
79
81
  domain_urn: Optional[str] = None
80
82
  if domain_registry:
@@ -99,6 +101,7 @@ def gen_schema_container(
99
101
  owner_urn=owner_urn,
100
102
  qualified_name=qualified_name,
101
103
  extra_properties=extra_properties,
104
+ structured_properties=structured_properties,
102
105
  )
103
106
 
104
107
 
@@ -133,6 +136,7 @@ def gen_database_container(
133
136
  created: Optional[int] = None,
134
137
  last_modified: Optional[int] = None,
135
138
  extra_properties: Optional[Dict[str, str]] = None,
139
+ structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
136
140
  ) -> Iterable[MetadataWorkUnit]:
137
141
  domain_urn: Optional[str] = None
138
142
  if domain_registry:
@@ -154,6 +158,7 @@ def gen_database_container(
154
158
  owner_urn=owner_urn,
155
159
  qualified_name=qualified_name,
156
160
  extra_properties=extra_properties,
161
+ structured_properties=structured_properties,
157
162
  )
158
163
 
159
164
 
@@ -33,7 +33,7 @@ from datahub.ingestion.api.decorators import (
33
33
  platform_name,
34
34
  support_status,
35
35
  )
36
- from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
36
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
37
37
  from datahub.ingestion.api.workunit import MetadataWorkUnit
38
38
  from datahub.ingestion.source.sql.sql_types import resolve_sql_type
39
39
  from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
@@ -265,11 +265,6 @@ class SupersetSource(StatefulIngestionSourceBase):
265
265
  # TODO(Gabe): how should we message about this error?
266
266
  return requests_session
267
267
 
268
- @classmethod
269
- def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
270
- config = SupersetConfig.parse_obj(config_dict)
271
- return cls(ctx, config)
272
-
273
268
  def paginate_entity_api_results(self, entity_type, page_size=100):
274
269
  current_page = 0
275
270
  total_items = page_size
@@ -71,7 +71,6 @@ from datahub.ingestion.api.decorators import (
71
71
  from datahub.ingestion.api.source import (
72
72
  CapabilityReport,
73
73
  MetadataWorkUnitProcessor,
74
- Source,
75
74
  StructuredLogLevel,
76
75
  TestableSource,
77
76
  TestConnectionReport,
@@ -804,11 +803,6 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
804
803
  def get_report(self) -> TableauSourceReport:
805
804
  return self.report
806
805
 
807
- @classmethod
808
- def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
809
- config = TableauConfig.parse_obj(config_dict)
810
- return cls(config, ctx)
811
-
812
806
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
813
807
  return [
814
808
  *super().get_workunit_processors(),