acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of acryl-datahub might be problematic.

Files changed (47)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2415 -2415
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +47 -46
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/ingestion/api/source_helpers.py +4 -0
  7. datahub/ingestion/run/pipeline.py +109 -143
  8. datahub/ingestion/run/sink_callback.py +77 -0
  9. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  10. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  11. datahub/ingestion/source/delta_lake/config.py +8 -1
  12. datahub/ingestion/source/delta_lake/report.py +4 -2
  13. datahub/ingestion/source/delta_lake/source.py +20 -5
  14. datahub/ingestion/source/elastic_search.py +26 -6
  15. datahub/ingestion/source/feast.py +27 -8
  16. datahub/ingestion/source/file.py +1 -1
  17. datahub/ingestion/source/identity/okta.py +1 -2
  18. datahub/ingestion/source/mlflow.py +30 -7
  19. datahub/ingestion/source/mode.py +7 -2
  20. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  21. datahub/ingestion/source/nifi.py +29 -6
  22. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  23. datahub/ingestion/source/pulsar.py +1 -0
  24. datahub/ingestion/source/redash.py +29 -6
  25. datahub/ingestion/source/s3/config.py +3 -1
  26. datahub/ingestion/source/salesforce.py +28 -6
  27. datahub/ingestion/source/slack/slack.py +31 -10
  28. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  30. datahub/ingestion/source/sql/oracle.py +34 -0
  31. datahub/ingestion/source_config/pulsar.py +3 -1
  32. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  33. datahub/metadata/_schema_classes.py +517 -410
  34. datahub/metadata/_urns/urn_defs.py +1670 -1670
  35. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  36. datahub/metadata/schema.avsc +17362 -17638
  37. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  38. datahub/metadata/schemas/__init__.py +3 -3
  39. datahub/sdk/__init__.py +29 -12
  40. datahub/sdk/_entity.py +18 -1
  41. datahub/sdk/container.py +3 -1
  42. datahub/sdk/dataset.py +5 -3
  43. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  44. datahub/utilities/unified_diff.py +5 -1
  45. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  46. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  47. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/salesforce.py
@@ -17,7 +17,9 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
 )
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -85,7 +95,10 @@ class SalesforceProfilingConfig(ConfigModel):
     # TODO - support field level profiling


-class SalesforceConfig(DatasetSourceConfigMixin):
+class SalesforceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+):
     platform: str = "salesforce"

     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -149,7 +162,7 @@ class SalesforceConfig(DatasetSourceConfigMixin):


 @dataclass
-class SalesforceSourceReport(SourceReport):
+class SalesforceSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
@@ -214,7 +227,7 @@ FIELD_TYPE_MAPPING = {
     capability_name=SourceCapability.TAGS,
     description="Enabled by default",
 )
-class SalesforceSource(Source):
+class SalesforceSource(StatefulIngestionSourceBase):
     base_url: str
     config: SalesforceConfig
     report: SalesforceSourceReport
@@ -223,7 +236,8 @@ class SalesforceSource(Source):
     fieldCounts: Dict[str, int]

     def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
         self.report = SalesforceSourceReport()
         self.session = requests.Session()
@@ -328,6 +342,14 @@ class SalesforceSource(Source):
             )
         )

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             sObjects = self.get_salesforce_objects()
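With SalesforceSource now extending StatefulIngestionSourceBase, stale-entity removal can be turned on from a recipe. A minimal sketch, assuming the standard stateful_ingestion config block; the credentials are placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "salesforce",
            "config": {
                "username": "user@example.com",  # placeholder
                "password": "...",               # placeholder
                "security_token": "...",         # placeholder
                "stateful_ingestion": {
                    "enabled": True,
                    "remove_stale_metadata": True,
                },
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()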
datahub/ingestion/source/slack/slack.py
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None


-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):


 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0

@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
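Both Salesforce and Slack now override get_workunit_processors() identically. Conceptually, a MetadataWorkUnitProcessor is a callable from a workunit stream to a workunit stream; the pipeline threads the stream through each processor in order, which is how the stale-entity handler can observe every entity a run emits and append soft-delete workunits at the end. A rough sketch with hypothetical names:

from typing import Callable, Iterable, List

WorkUnit = str  # stand-in for MetadataWorkUnit in this sketch
Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]

def stale_entity_marker(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
    seen: List[WorkUnit] = []
    for wu in stream:
        seen.append(wu)  # record everything this run produced
        yield wu
    # once the stream is exhausted, removals would be emitted here for
    # entities seen in the previous run but not in this one (elided)

def apply_processors(
    stream: Iterable[WorkUnit], processors: List[Processor]
) -> Iterable[WorkUnit]:
    for process in processors:
        stream = process(stream)  # each processor wraps the previous one
    return stream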
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -134,10 +134,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
         clustering_key AS "CLUSTERING_KEY",
         auto_clustering_on AS "AUTO_CLUSTERING_ON",
         is_dynamic AS "IS_DYNAMIC",
-        is_iceberg AS "IS_ICEBERG"
+        is_iceberg AS "IS_ICEBERG",
+        is_hybrid AS "IS_HYBRID"
     FROM {db_clause}information_schema.tables t
     where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
     order by table_schema, table_name"""

     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False

     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
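Taken together, the Snowflake hunks move hybrid-table detection from a 'HYBRID TABLE' entry in the table_type filter (plus a type-string property) to the information_schema IS_HYBRID column. That column comes back as the strings 'YES'/'NO', hence the normalization above; a trivial sketch of that parsing:

# 'YES'/'NO' string flag normalized to a bool, defaulting to False when
# the column is absent (column name per the queries in this diff).
row = {"IS_DYNAMIC": "NO", "IS_ICEBERG": "NO", "IS_HYBRID": "YES"}
is_hybrid = row.get("IS_HYBRID", "NO").upper() == "YES"
assert is_hybrid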
datahub/ingestion/source/sql/oracle.py
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re

 # This import verifies that the dependencies are available.
@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )

     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):
@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values

+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:
@@ -586,6 +609,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")

+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
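Together, the Oracle changes let a recipe opt into python-oracledb's thick mode, with the client library path required only on macOS and Windows. A minimal sketch using the new fields from this diff; connection details are placeholders:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "oracle",
            "config": {
                "host_port": "localhost:1521",  # placeholder
                "service_name": "ORCLPDB1",     # placeholder
                "username": "datahub",          # placeholder
                "password": "...",              # placeholder
                "enable_thick_mode": True,
                # required on macOS/Windows; ignored on Linux, where
                # ldconfig or LD_LIBRARY_PATH locates the client libraries
                "thick_mode_lib_dir": "/opt/oracle/instantclient",
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()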
datahub/ingestion/source_config/pulsar.py
@@ -33,7 +33,9 @@ def _is_valid_hostname(hostname: str) -> bool:


 class PulsarSourceConfig(
-    StatefulIngestionConfigBase, PlatformInstanceConfigMixin, EnvConfigMixin
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
 ):
     web_service_url: str = Field(
         default="http://localhost:8080", description="The web URL for the cluster."
datahub/ingestion/transformer/pattern_cleanup_ownership.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from typing import List, Optional, Set, cast

@@ -10,8 +11,11 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
 )
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

-_USER_URN_PREFIX: str = "urn:li:corpuser:"
+logger = logging.getLogger(__name__)


 class PatternCleanUpOwnershipConfig(ConfigModel):
@@ -49,6 +53,11 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         else:
             return set()

+    def _process_owner(self, name: str) -> str:
+        for value in self.config.pattern_for_cleanup:
+            name = re.sub(value, "", name)
+        return name
+
     def transform_aspect(
         self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
     ) -> Optional[builder.Aspect]:
@@ -58,14 +67,23 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         # clean all the owners based on the parameters received from config
         cleaned_owner_urns: List[str] = []
         for owner_urn in current_owner_urns:
-            user_id: str = owner_urn.split(_USER_URN_PREFIX)[1]
-            for value in self.config.pattern_for_cleanup:
-                user_id = re.sub(value, "", user_id)
-
-            cleaned_owner_urns.append(_USER_URN_PREFIX + user_id)
+            username = ""
+            try:
+                owner: Urn = Urn.from_string(owner_urn)
+                if isinstance(owner, CorpUserUrn):
+                    username = str(CorpUserUrn(self._process_owner(owner.username)))
+                elif isinstance(owner, CorpGroupUrn):
+                    username = str(CorpGroupUrn(self._process_owner(owner.name)))
+                else:
+                    logger.warning(f"{owner_urn} is not a supported owner type.")
+                    username = owner_urn
+            except InvalidUrnError:
+                logger.warning(f"Could not parse {owner_urn} from {entity_urn}")
+                username = owner_urn
+            cleaned_owner_urns.append(username)

         ownership_type, ownership_type_urn = builder.validate_ownership_type(
-            OwnershipTypeClass.DATAOWNER
+            OwnershipTypeClass.TECHNICAL_OWNER
         )
         owners = [
             OwnerClass(
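The rewritten loop parses each owner URN instead of string-splitting on a corpuser prefix, so group owners are cleaned too, and malformed or unsupported URNs pass through with a warning instead of raising an IndexError. A small sketch of the cleanup step, with a hypothetical pattern standing in for pattern_for_cleanup:

import re
from datahub.metadata.urns import CorpUserUrn

patterns = [r"@example\.com$"]  # hypothetical pattern_for_cleanup values

def process_owner(name: str) -> str:
    # strip every configured pattern from the owner id
    for pattern in patterns:
        name = re.sub(pattern, "", name)
    return name

owner = CorpUserUrn("alice@example.com")
print(CorpUserUrn(process_owner(owner.username)))  # urn:li:corpuser:alice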