acryl-datahub 0.15.0.2rc4__py3-none-any.whl → 0.15.0.2rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (50)
  1. {acryl_datahub-0.15.0.2rc4.dist-info → acryl_datahub-0.15.0.2rc6.dist-info}/METADATA +2440 -2440
  2. {acryl_datahub-0.15.0.2rc4.dist-info → acryl_datahub-0.15.0.2rc6.dist-info}/RECORD +50 -46
  3. datahub/__init__.py +1 -1
  4. datahub/cli/delete_cli.py +3 -3
  5. datahub/cli/migrate.py +2 -2
  6. datahub/emitter/mcp_builder.py +27 -0
  7. datahub/emitter/rest_emitter.py +1 -1
  8. datahub/ingestion/api/source.py +2 -2
  9. datahub/ingestion/source/delta_lake/source.py +0 -5
  10. datahub/ingestion/source/demo_data.py +1 -1
  11. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  12. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +6 -2
  13. datahub/ingestion/source/iceberg/iceberg.py +10 -3
  14. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  15. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  16. datahub/ingestion/source/kafka_connect/kafka_connect.py +1 -6
  17. datahub/ingestion/source/metabase.py +1 -6
  18. datahub/ingestion/source/mlflow.py +0 -5
  19. datahub/ingestion/source/nifi.py +0 -5
  20. datahub/ingestion/source/redash.py +0 -5
  21. datahub/ingestion/source/redshift/redshift.py +1 -0
  22. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  23. datahub/ingestion/source/snowflake/snowflake_schema.py +5 -2
  24. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -20
  25. datahub/ingestion/source/snowflake/snowflake_tag.py +14 -4
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +0 -6
  27. datahub/ingestion/source/sql/sql_types.py +1 -1
  28. datahub/ingestion/source/sql/sql_utils.py +5 -0
  29. datahub/ingestion/source/superset.py +1 -6
  30. datahub/ingestion/source/tableau/tableau.py +0 -6
  31. datahub/metadata/_schema_classes.py +314 -41
  32. datahub/metadata/_urns/urn_defs.py +54 -0
  33. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  34. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  35. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  36. datahub/metadata/schema.avsc +296 -87
  37. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  38. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  39. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  40. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  41. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  42. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  43. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  44. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  45. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  46. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  47. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  48. {acryl_datahub-0.15.0.2rc4.dist-info → acryl_datahub-0.15.0.2rc6.dist-info}/WHEEL +0 -0
  49. {acryl_datahub-0.15.0.2rc4.dist-info → acryl_datahub-0.15.0.2rc6.dist-info}/entry_points.txt +0 -0
  50. {acryl_datahub-0.15.0.2rc4.dist-info → acryl_datahub-0.15.0.2rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg_common.py

@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList

 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -146,19 +147,40 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         return load_catalog(name=catalog_name, **catalog_config)


+class TopTableTimings:
+    _VALUE_FIELD: str = "timing"
+    top_entites: SortedList
+    _size: int
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+    def add(self, entity: Dict[str, Any]) -> None:
+        if self._VALUE_FIELD not in entity:
+            return
+        self.top_entites.add(entity)
+        if len(self.top_entites) > self._size:
+            self.top_entites.pop()
+
+    def __str__(self) -> str:
+        if len(self.top_entites) == 0:
+            return "no timings reported"
+        return str(list(self.top_entites))
+
+
 class TimingClass:
-    times: List[int]
+    times: SortedList

     def __init__(self):
-        self.times = []
+        self.times = SortedList()

-    def add_timing(self, t):
-        self.times.append(t)
+    def add_timing(self, t: float) -> None:
+        self.times.add(t)

-    def __str__(self):
+    def __str__(self) -> str:
         if len(self.times) == 0:
             return "no timings reported"
-        self.times.sort()
         total = sum(self.times)
         avg = total / len(self.times)
         return str(
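The new TopTableTimings keeps only the N slowest entries: the SortedList orders entry dicts by negated timing (largest first), so once the list grows past `size`, pop() discards the entry with the smallest timing. A minimal standalone sketch of the same bounded top-N pattern (the names here are illustrative, not from the source):

    from sortedcontainers import SortedList

    top = SortedList(key=lambda e: -e["timing"])  # slowest entries sort first
    for name, secs in [("a", 1.2), ("b", 9.5), ("c", 0.3), ("d", 4.1)]:
        top.add({"table": name, "timing": secs})
        if len(top) > 3:  # cap at the 3 slowest (the class above defaults to 10)
            top.pop()     # pop() drops the last element, i.e. the fastest one kept
    print([e["table"] for e in top])  # ['b', 'd', 'a']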
@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+    tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
     listed_namespaces: int = 0
     total_listed_tables: int = 0
     tables_listed_per_namespace: TopKDict[str, int] = field(
@@ -201,11 +226,26 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)

-    def report_table_load_time(self, t: float) -> None:
+    def report_table_load_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.load_table_timings.add_timing(t)
+        self.tables_load_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )

-    def report_table_processing_time(self, t: float) -> None:
+    def report_table_processing_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.processing_table_timings.add_timing(t)
+        self.tables_process_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )

-    def report_table_profiling_time(self, t: float) -> None:
+    def report_table_profiling_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.profiling_table_timings.add_timing(t)
+        self.tables_profile_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -204,7 +204,9 @@ class IcebergProfiler:
             )
             dataset_profile.fieldProfiles.append(column_profile)
         time_taken = timer.elapsed_seconds()
-        self.report.report_table_profiling_time(time_taken)
+        self.report.report_table_profiling_time(
+            time_taken, dataset_name, table.metadata_location
+        )
         LOGGER.debug(
             f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )
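Call sites now pass the table name and metadata location along with the elapsed time, as in the profiler hunk above. A hedged sketch of the updated reporting call, assuming DataHub's PerfTimer utility and an in-scope IcebergSourceReport named `report` (the `catalog`, `table_id`, and `dataset_name` names are illustrative):

    from datahub.utilities.perf_timer import PerfTimer

    with PerfTimer() as timer:
        table = catalog.load_table(table_id)  # pyiceberg load step
    report.report_table_load_time(
        timer.elapsed_seconds(), dataset_name, table.metadata_location
    )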
datahub/ingestion/source/kafka_connect/kafka_connect.py

@@ -17,7 +17,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.kafka_connect.common import (
     CONNECTOR_CLASS,
@@ -94,11 +94,6 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         if not jpype.isJVMStarted():
            jpype.startJVM()

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = KafkaConnectSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
         """Get Kafka Connect connectors manifest using REST API.
         Enrich with lineages metadata.
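This hunk and the matching ones below (Metabase, MLflow, NiFi, Redash, Snowflake, Superset, Tableau) delete near-identical per-source create() classmethods, which suggests the Source base class now supplies an equivalent default factory. A hedged sketch of that pattern (the get_config_class helper is an assumption, not shown in this diff; note the removed overrides did not even agree on constructor argument order):

    @classmethod
    def create(cls, config_dict: dict, ctx: "PipelineContext") -> "Source":
        # assumed: each source declares its config class, so a base-class
        # default can parse the dict and construct the source generically
        config = cls.get_config_class().parse_obj(config_dict)
        return cls(config, ctx)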
datahub/ingestion/source/metabase.py

@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):

         return platform, dbname, schema, platform_instance

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/mlflow.py

@@ -333,8 +333,3 @@ class MLflowSource(Source):
                 aspect=global_tags,
             )
         return wu
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MLflowConfig.parse_obj(config_dict)
-        return cls(ctx, config)
datahub/ingestion/source/nifi.py

@@ -484,11 +484,6 @@ class NifiSource(Source):
     def rest_api_base_url(self):
         return self.config.site_url[: -len("nifi/")] + "nifi-api/"

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = NifiSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_report(self) -> SourceReport:
         return self.report

datahub/ingestion/source/redash.py

@@ -369,11 +369,6 @@ class RedashSource(Source):
         else:
             raise ValueError(f"Failed to connect to {self.config.connect_uri}/api")

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = RedashConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
datahub/ingestion/source/redshift/redshift.py

@@ -276,6 +276,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         "HLLSKETCH": NullType,
         "TIMETZ": TimeType,
         "VARBYTE": StringType,
+        "SUPER": NullType,
     }

     def get_platform_instance_id(self) -> str:
datahub/ingestion/source/snowflake/snowflake_config.py

@@ -244,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )

+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -263,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )

+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,
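The new structured_property_pattern is an AllowDenyPattern matched against the value-free `database.schema.tag_name` identifier (see the SnowflakeTagExtractor hunk further down), rather than the `name:value` form used by tag_pattern. A small sketch of how such a filter behaves (the regexes are illustrative):

    from datahub.configuration.common import AllowDenyPattern

    pattern = AllowDenyPattern(allow=[r"analytics\..*"], deny=[r".*\.internal_.*"])
    pattern.allowed("analytics.public.cost_center")    # True
    pattern.allowed("analytics.public.internal_flag")  # False (denied)
    pattern.allowed("raw.public.cost_center")          # False (not allowed)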
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -45,15 +45,18 @@ class SnowflakeTag:
     name: str
     value: str

-    def display_name(self) -> str:
+    def tag_display_name(self) -> str:
         return f"{self.name}: {self.value}"

-    def identifier(self) -> str:
+    def tag_identifier(self) -> str:
         return f"{self._id_prefix_as_str()}:{self.value}"

     def _id_prefix_as_str(self) -> str:
         return f"{self.database}.{self.schema}.{self.name}"

+    def structured_property_identifier(self) -> str:
+        return f"snowflake.{self.database}.{self.schema}.{self.name}"
+

 @dataclass
 class SnowflakeColumn(BaseColumn):
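The renames separate two identifier formats: tag_identifier() keeps the value-bearing `db.schema.name:value` form used to mint DataHub tag URNs, while the new structured_property_identifier() yields a value-free `snowflake.db.schema.name` form usable as a structured-property qualified name. A sketch, assuming only the dataclass fields shown above (the tag values are illustrative):

    tag = SnowflakeTag(database="ANALYTICS", schema="PUBLIC", name="PII", value="high")
    tag.tag_identifier()                  # 'ANALYTICS.PUBLIC.PII:high'
    tag.structured_property_identifier()  # 'snowflake.ANALYTICS.PUBLIC.PII'

The schema generator additionally passes both forms through snowflake_identifier(), which can normalize casing depending on configuration.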
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -4,12 +4,14 @@ from typing import Dict, Iterable, List, Optional, Union

 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import (
+    get_sys_time,
     make_data_platform_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
     make_tag_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import add_structured_properties_to_entity_wu
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
@@ -72,6 +74,7 @@ from datahub.ingestion.source_report.ingestion_stage import (
     PROFILING,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
+    AuditStamp,
     GlobalTags,
     Status,
     SubTypes,
@@ -98,7 +101,18 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     StringType,
     TimeType,
 )
+from datahub.metadata.com.linkedin.pegasus2avro.structured import (
+    StructuredPropertyDefinition,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties
+from datahub.metadata.urns import (
+    ContainerUrn,
+    DatasetUrn,
+    DataTypeUrn,
+    EntityTypeUrn,
+    SchemaFieldUrn,
+    StructuredPropertyUrn,
+)
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownLineageMapping,
     SqlParsingAggregator,
@@ -673,14 +687,31 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             yield from self.gen_dataset_workunits(view, schema_name, db_name)

     def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
-        tag_identifier = tag.identifier()
+        use_sp = self.config.extract_tags_as_structured_properties
+        identifier = (
+            self.snowflake_identifier(tag.structured_property_identifier())
+            if use_sp
+            else tag.tag_identifier()
+        )

-        if self.report.is_tag_processed(tag_identifier):
+        if self.report.is_tag_processed(identifier):
             return

-        self.report.report_tag_processed(tag_identifier)
-
-        yield from self.gen_tag_workunits(tag)
+        self.report.report_tag_processed(identifier)
+        if use_sp:
+            yield from self.gen_tag_as_structured_property_workunits(tag)
+        else:
+            yield from self.gen_tag_workunits(tag)
+
+    def _format_tags_as_structured_properties(
+        self, tags: List[SnowflakeTag]
+    ) -> Dict[StructuredPropertyUrn, str]:
+        return {
+            StructuredPropertyUrn(
+                self.snowflake_identifier(tag.structured_property_identifier())
+            ): tag.value
+            for tag in tags
+        }

     def gen_dataset_workunits(
         self,
@@ -725,6 +756,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             env=self.config.env,
         )

+        if self.config.extract_tags_as_structured_properties:
+            yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
+
         yield from add_table_to_schema_container(
             dataset_urn=dataset_urn,
             parent_container_key=schema_container_key,
@@ -758,16 +792,24 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         )

         if table.tags:
-            tag_associations = [
-                TagAssociation(
-                    tag=make_tag_urn(self.snowflake_identifier(tag.identifier()))
+            if self.config.extract_tags_as_structured_properties:
+                yield from add_structured_properties_to_entity_wu(
+                    dataset_urn,
+                    self._format_tags_as_structured_properties(table.tags),
                 )
-                for tag in table.tags
-            ]
-            global_tags = GlobalTags(tag_associations)
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=global_tags
-            ).as_workunit()
+            else:
+                tag_associations = [
+                    TagAssociation(
+                        tag=make_tag_urn(
+                            self.snowflake_identifier(tag.tag_identifier())
+                        )
+                    )
+                    for tag in table.tags
+                ]
+                global_tags = GlobalTags(tag_associations)
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn, aspect=global_tags
+                ).as_workunit()

         if isinstance(table, SnowflakeView) and table.view_definition is not None:
             view_properties_aspect = ViewProperties(
@@ -840,10 +882,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         )

     def gen_tag_workunits(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]:
-        tag_urn = make_tag_urn(self.snowflake_identifier(tag.identifier()))
+        tag_urn = make_tag_urn(self.snowflake_identifier(tag.tag_identifier()))

         tag_properties_aspect = TagProperties(
-            name=tag.display_name(),
+            name=tag.tag_display_name(),
             description=f"Represents the Snowflake tag `{tag._id_prefix_as_str()}` with value `{tag.value}`.",
         )

@@ -851,6 +893,41 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             entityUrn=tag_urn, aspect=tag_properties_aspect
         ).as_workunit()

+    def gen_tag_as_structured_property_workunits(
+        self, tag: SnowflakeTag
+    ) -> Iterable[MetadataWorkUnit]:
+        identifier = self.snowflake_identifier(tag.structured_property_identifier())
+        urn = StructuredPropertyUrn(identifier).urn()
+        aspect = StructuredPropertyDefinition(
+            qualifiedName=identifier,
+            displayName=tag.name,
+            valueType=DataTypeUrn("datahub.string").urn(),
+            entityTypes=[
+                EntityTypeUrn(f"datahub.{ContainerUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{DatasetUrn.ENTITY_TYPE}").urn(),
+                EntityTypeUrn(f"datahub.{SchemaFieldUrn.ENTITY_TYPE}").urn(),
+            ],
+            lastModified=AuditStamp(
+                time=get_sys_time(), actor="urn:li:corpuser:datahub"
+            ),
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=aspect,
+        ).as_workunit()
+
+    def gen_column_tags_as_structured_properties(
+        self, dataset_urn: str, table: Union[SnowflakeTable, SnowflakeView]
+    ) -> Iterable[MetadataWorkUnit]:
+        for column_name in table.column_tags:
+            schema_field_urn = SchemaFieldUrn(dataset_urn, column_name).urn()
+            yield from add_structured_properties_to_entity_wu(
+                schema_field_urn,
+                self._format_tags_as_structured_properties(
+                    table.column_tags[column_name]
+                ),
+            )
+
     def gen_schema_metadata(
         self,
         table: Union[SnowflakeTable, SnowflakeView],
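Each distinct Snowflake tag key thus becomes a single string-valued StructuredPropertyDefinition scoped to containers, datasets, and schema fields, while each tagged entity receives the tag's value via add_structured_properties_to_entity_wu. A small sketch of the URN minted above (the identifier is illustrative; lowercasing by snowflake_identifier is assumed):

    from datahub.metadata.urns import StructuredPropertyUrn

    urn = StructuredPropertyUrn("snowflake.analytics.public.pii").urn()
    # 'urn:li:structuredProperty:snowflake.analytics.public.pii'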
datahub/ingestion/source/snowflake/snowflake_schema_gen.py (continued)

@@ -892,13 +969,14 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 [
                     TagAssociation(
                         make_tag_urn(
-                            self.snowflake_identifier(tag.identifier())
+                            self.snowflake_identifier(tag.tag_identifier())
                         )
                     )
                     for tag in table.column_tags[col.name]
                 ]
             )
             if col.name in table.column_tags
+            and not self.config.extract_tags_as_structured_properties
             else None
         ),
     )
@@ -985,8 +1063,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 )
             ),
             tags=(
-                [self.snowflake_identifier(tag.identifier()) for tag in database.tags]
+                [
+                    self.snowflake_identifier(tag.tag_identifier())
+                    for tag in database.tags
+                ]
                 if database.tags
+                and not self.config.extract_tags_as_structured_properties
+                else None
+            ),
+            structured_properties=(
+                self._format_tags_as_structured_properties(database.tags)
+                if database.tags and self.config.extract_tags_as_structured_properties
                 else None
             ),
         )
@@ -1038,8 +1125,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 else None
             ),
             tags=(
-                [self.snowflake_identifier(tag.identifier()) for tag in schema.tags]
-                if schema.tags
+                [self.snowflake_identifier(tag.tag_identifier()) for tag in schema.tags]
+                if schema.tags and not self.config.extract_tags_as_structured_properties
+                else None
+            ),
+            structured_properties=(
+                self._format_tags_as_structured_properties(schema.tags)
+                if schema.tags and self.config.extract_tags_as_structured_properties
                 else None
             ),
         )
datahub/ingestion/source/snowflake/snowflake_tag.py

@@ -165,10 +165,20 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):

         allowed_tags = []
         for tag in tags:
-            tag_identifier = tag.identifier()
-            self.report.report_entity_scanned(tag_identifier, "tag")
-            if not self.config.tag_pattern.allowed(tag_identifier):
-                self.report.report_dropped(tag_identifier)
+            identifier = (
+                tag._id_prefix_as_str()
+                if self.config.extract_tags_as_structured_properties
+                else tag.tag_identifier()
+            )
+            self.report.report_entity_scanned(identifier, "tag")
+
+            pattern = (
+                self.config.structured_property_pattern
+                if self.config.extract_tags_as_structured_properties
+                else self.config.tag_pattern
+            )
+            if not pattern.allowed(identifier):
+                self.report.report_dropped(identifier)
             else:
                 allowed_tags.append(tag)
         return allowed_tags
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -23,7 +23,6 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
-    Source,
     SourceCapability,
     SourceReport,
     TestableSource,
@@ -251,11 +250,6 @@ class SnowflakeV2Source(

         self.add_config_to_report()

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = SnowflakeV2Config.parse_obj(config_dict)
-        return cls(ctx, config)
-
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
datahub/ingestion/source/sql/sql_types.py

@@ -93,7 +93,7 @@ POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "regtype": None,
     "regrole": None,
     "regnamespace": None,
-    "super": None,
+    "super": NullType,
     "uuid": StringType,
     "pg_lsn": None,
     "tsvector": None,  # text search vector
datahub/ingestion/source/sql/sql_utils.py

@@ -20,6 +20,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import DataPlatformInstanceClass
+from datahub.metadata.urns import StructuredPropertyUrn
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.urns.dataset_urn import DatasetUrn

@@ -75,6 +76,7 @@ def gen_schema_container(
     created: Optional[int] = None,
     last_modified: Optional[int] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
 ) -> Iterable[MetadataWorkUnit]:
     domain_urn: Optional[str] = None
     if domain_registry:
@@ -99,6 +101,7 @@ def gen_schema_container(
         owner_urn=owner_urn,
         qualified_name=qualified_name,
         extra_properties=extra_properties,
+        structured_properties=structured_properties,
     )

@@ -133,6 +136,7 @@ def gen_database_container(
     created: Optional[int] = None,
     last_modified: Optional[int] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
 ) -> Iterable[MetadataWorkUnit]:
     domain_urn: Optional[str] = None
     if domain_registry:
@@ -154,6 +158,7 @@ def gen_database_container(
         owner_urn=owner_urn,
         qualified_name=qualified_name,
         extra_properties=extra_properties,
+        structured_properties=structured_properties,
     )

datahub/ingestion/source/superset.py

@@ -33,7 +33,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
@@ -265,11 +265,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         # TODO(Gabe): how should we message about this error?
         return requests_session

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = SupersetConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def paginate_entity_api_results(self, entity_type, page_size=100):
         current_page = 0
         total_items = page_size
datahub/ingestion/source/tableau/tableau.py

@@ -71,7 +71,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
-    Source,
     StructuredLogLevel,
     TestableSource,
     TestConnectionReport,
@@ -804,11 +803,6 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
     def get_report(self) -> TableauSourceReport:
         return self.report

-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = TableauConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),