acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (72)
  1. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/METADATA +2449 -2449
  2. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/RECORD +72 -71
  3. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/docker_cli.py +1 -1
  6. datahub/cli/iceberg_cli.py +1 -1
  7. datahub/cli/ingest_cli.py +3 -1
  8. datahub/cli/lite_cli.py +4 -2
  9. datahub/cli/specific/dataproduct_cli.py +1 -1
  10. datahub/configuration/kafka.py +1 -1
  11. datahub/ingestion/api/source_helpers.py +4 -0
  12. datahub/ingestion/fs/s3_fs.py +2 -2
  13. datahub/ingestion/graph/client.py +15 -6
  14. datahub/ingestion/graph/entity_versioning.py +3 -3
  15. datahub/ingestion/run/pipeline.py +109 -143
  16. datahub/ingestion/run/sink_callback.py +77 -0
  17. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  18. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  19. datahub/ingestion/source/csv_enricher.py +2 -2
  20. datahub/ingestion/source/delta_lake/config.py +8 -1
  21. datahub/ingestion/source/delta_lake/report.py +4 -2
  22. datahub/ingestion/source/delta_lake/source.py +20 -5
  23. datahub/ingestion/source/dremio/dremio_api.py +3 -3
  24. datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
  25. datahub/ingestion/source/elastic_search.py +26 -6
  26. datahub/ingestion/source/feast.py +27 -8
  27. datahub/ingestion/source/file.py +1 -1
  28. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  29. datahub/ingestion/source/identity/okta.py +1 -2
  30. datahub/ingestion/source/kafka/kafka.py +1 -1
  31. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  32. datahub/ingestion/source/looker/lookml_source.py +1 -1
  33. datahub/ingestion/source/metabase.py +54 -32
  34. datahub/ingestion/source/mlflow.py +30 -7
  35. datahub/ingestion/source/mode.py +8 -3
  36. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  37. datahub/ingestion/source/nifi.py +29 -6
  38. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  39. datahub/ingestion/source/pulsar.py +3 -2
  40. datahub/ingestion/source/redash.py +29 -6
  41. datahub/ingestion/source/s3/config.py +3 -1
  42. datahub/ingestion/source/salesforce.py +28 -6
  43. datahub/ingestion/source/sigma/sigma.py +1 -1
  44. datahub/ingestion/source/slack/slack.py +31 -10
  45. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  46. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  47. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  48. datahub/ingestion/source/sql/druid.py +1 -5
  49. datahub/ingestion/source/sql/oracle.py +34 -0
  50. datahub/ingestion/source/tableau/tableau.py +2 -1
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -1
  52. datahub/ingestion/source_config/pulsar.py +3 -1
  53. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  54. datahub/lite/duckdb_lite.py +2 -1
  55. datahub/lite/lite_local.py +1 -1
  56. datahub/lite/lite_util.py +4 -3
  57. datahub/metadata/_schema_classes.py +517 -410
  58. datahub/metadata/_urns/urn_defs.py +1670 -1670
  59. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  60. datahub/metadata/schema.avsc +17362 -17638
  61. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  62. datahub/metadata/schemas/__init__.py +3 -3
  63. datahub/sdk/__init__.py +29 -12
  64. datahub/sdk/_entity.py +18 -1
  65. datahub/sdk/container.py +3 -1
  66. datahub/sdk/dataset.py +5 -3
  67. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  68. datahub/utilities/memory_footprint.py +3 -2
  69. datahub/utilities/unified_diff.py +5 -1
  70. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/LICENSE +0 -0
  71. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/entry_points.txt +0 -0
  72. {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
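The report_server.py changes above (and the matching redash.py, salesforce.py, and slack.py changes further down) follow one pattern: the source now extends StatefulIngestionSourceBase and registers StaleEntityRemovalHandler as a workunit processor, so entities that disappear between runs can be marked as removed. A rough sketch of how a recipe would typically opt into that behaviour when run programmatically; the source type name, connection settings, and sink address here are illustrative placeholders, not taken from this diff:

    # Hypothetical recipe enabling stale-entity removal for a stateful source.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            # pipeline_name keys the ingestion state shared between runs
            "pipeline_name": "powerbi_report_server_prod",
            "source": {
                "type": "powerbi-report-server",  # assumed registered source name
                "config": {
                    # ... connection settings (username, password, etc.) ...
                    "stateful_ingestion": {
                        "enabled": True,
                        "remove_stale_metadata": True,
                    },
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()
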
datahub/ingestion/source/pulsar.py
@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 
@@ -229,8 +230,8 @@ class PulsarSource(StatefulIngestionSourceBase):
             self.report.report_warning("HTTPError", message)
         except requests.exceptions.RequestException as e:
             raise Exception(
-                f"An ambiguous exception occurred while handling the request: {e}"
-            )
+                "An ambiguous exception occurred while handling the request"
+            ) from e
 
     @classmethod
     def create(cls, config_dict, ctx):
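The pulsar.py change above swaps string-interpolating the caught error into a new exception message for explicit exception chaining; the sigma.py hunk further down makes the same change. A small, generic Python illustration (not DataHub code) of what `raise ... from e` preserves:

    # The chained exception stays attached as __cause__, so tracebacks still
    # show the root cause even though its message is no longer interpolated.
    def fetch() -> None:
        raise ConnectionError("socket closed")

    def load() -> None:
        try:
            fetch()
        except ConnectionError as e:
            raise RuntimeError("An ambiguous exception occurred while handling the request") from e

    try:
        load()
    except RuntimeError as e:
        assert isinstance(e.__cause__, ConnectionError)
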
datahub/ingestion/source/redash.py
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
         return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -724,6 +739,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
datahub/ingestion/source/s3/config.py
@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
datahub/ingestion/source/salesforce.py
@@ -17,7 +17,9 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
 )
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -85,7 +95,10 @@ class SalesforceProfilingConfig(ConfigModel):
     # TODO - support field level profiling
 
 
-class SalesforceConfig(DatasetSourceConfigMixin):
+class SalesforceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+):
     platform: str = "salesforce"
 
     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -149,7 +162,7 @@ class SalesforceConfig(DatasetSourceConfigMixin):
 
 
 @dataclass
-class SalesforceSourceReport(SourceReport):
+class SalesforceSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -214,7 +227,7 @@ FIELD_TYPE_MAPPING = {
     capability_name=SourceCapability.TAGS,
     description="Enabled by default",
 )
-class SalesforceSource(Source):
+class SalesforceSource(StatefulIngestionSourceBase):
     base_url: str
     config: SalesforceConfig
     report: SalesforceSourceReport
@@ -223,7 +236,8 @@ class SalesforceSource(Source):
     fieldCounts: Dict[str, int]
 
     def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
         self.report = SalesforceSourceReport()
         self.session = requests.Session()
@@ -328,6 +342,14 @@ class SalesforceSource(Source):
             )
         )
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             sObjects = self.get_salesforce_objects()
datahub/ingestion/source/sigma/sigma.py
@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError(f"Unable to connect sigma API. Exception: {e}")
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/slack/slack.py
@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None
 
 
-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
    )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
datahub/ingestion/source/slack/slack.py (continued)
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):
 
 
 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0
 
@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
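The snowflake_connection.py change above adds `from None`, the counterpart of the `from e` idiom: it suppresses the implicit exception context so only the explicit ValueError is reported. A generic illustration (not DataHub code):

    # "from None" suppresses the implicit exception context, so the handler
    # (and the traceback) see only the explicit ValueError.
    def read_token(response: dict) -> str:
        try:
            return response["access_token"]
        except KeyError:
            raise ValueError(f"access_token not found in response {response}.") from None

    try:
        read_token({"error": "invalid_client"})
    except ValueError as e:
        assert e.__cause__ is None and e.__suppress_context__
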
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -134,10 +134,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False
 
     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
datahub/ingestion/source/sql/druid.py
@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
datahub/ingestion/source/sql/oracle.py
@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re
 
 # This import verifies that the dependencies are available.
@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )
 
     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):
@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values
 
+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:
@@ -586,6 +609,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")
 
+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
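For the new oracle.py options, a hypothetical source config fragment showing how they would be set; only enable_thick_mode and thick_mode_lib_dir come from this diff, and the connection values are placeholders:

    # Hypothetical Oracle source config fragment (values are illustrative).
    oracle_source_config = {
        "host_port": "oracle.example.com:1521",
        "username": "datahub",
        "password": "...",
        "service_name": "ORCLPDB1",
        # thin mode is the default; thick mode needs the Oracle client libraries
        "enable_thick_mode": True,
        # required on macOS/Windows when thick mode is on; ignored on Linux,
        # where ldconfig / LD_LIBRARY_PATH locate the client libraries
        "thick_mode_lib_dir": "/opt/oracle/instantclient",
    }
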
datahub/ingestion/source/tableau/tableau.py
@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
datahub/ingestion/source/tableau/tableau_common.py
@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
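The tableau.py and tableau_common.py hunks above (and the duckdb_lite.py and lite_local.py hunks at the end) all replace mutable default arguments with None plus an `or` fallback. A generic illustration (not DataHub code) of the pitfall being avoided:

    # A default list/dict is created once at definition time and shared
    # across calls, so mutations leak between invocations.
    from typing import List, Optional

    def buggy(params: List[str] = []) -> List[str]:
        params.append("tag")
        return params

    assert buggy() == ["tag"]
    assert buggy() == ["tag", "tag"]  # the same list object is reused across calls

    def fixed(params: Optional[List[str]] = None) -> List[str]:
        params = params or []  # the pattern adopted throughout this release
        params.append("tag")
        return params

    assert fixed() == ["tag"]
    assert fixed() == ["tag"]  # each call gets a fresh list
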
datahub/ingestion/source_config/pulsar.py
@@ -33,7 +33,9 @@ def _is_valid_hostname(hostname: str) -> bool:
 
 
 class PulsarSourceConfig(
-    StatefulIngestionConfigBase, PlatformInstanceConfigMixin, EnvConfigMixin
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
 ):
     web_service_url: str = Field(
         default="http://localhost:8080", description="The web URL for the cluster."
datahub/ingestion/transformer/pattern_cleanup_ownership.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from typing import List, Optional, Set, cast
 
@@ -10,8 +11,11 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
 )
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError
 
-_USER_URN_PREFIX: str = "urn:li:corpuser:"
+logger = logging.getLogger(__name__)
 
 
 class PatternCleanUpOwnershipConfig(ConfigModel):
@@ -49,6 +53,11 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         else:
             return set()
 
+    def _process_owner(self, name: str) -> str:
+        for value in self.config.pattern_for_cleanup:
+            name = re.sub(value, "", name)
+        return name
+
     def transform_aspect(
         self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
     ) -> Optional[builder.Aspect]:
@@ -58,14 +67,23 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         # clean all the owners based on the parameters received from config
         cleaned_owner_urns: List[str] = []
         for owner_urn in current_owner_urns:
-            user_id: str = owner_urn.split(_USER_URN_PREFIX)[1]
-            for value in self.config.pattern_for_cleanup:
-                user_id = re.sub(value, "", user_id)
-
-            cleaned_owner_urns.append(_USER_URN_PREFIX + user_id)
+            username = ""
+            try:
+                owner: Urn = Urn.from_string(owner_urn)
+                if isinstance(owner, CorpUserUrn):
+                    username = str(CorpUserUrn(self._process_owner(owner.username)))
+                elif isinstance(owner, CorpGroupUrn):
+                    username = str(CorpGroupUrn(self._process_owner(owner.name)))
+                else:
+                    logger.warning(f"{owner_urn} is not a supported owner type.")
+                    username = owner_urn
+            except InvalidUrnError:
+                logger.warning(f"Could not parse {owner_urn} from {entity_urn}")
+                username = owner_urn
+            cleaned_owner_urns.append(username)
 
         ownership_type, ownership_type_urn = builder.validate_ownership_type(
-            OwnershipTypeClass.DATAOWNER
+            OwnershipTypeClass.TECHNICAL_OWNER
         )
         owners = [
             OwnerClass(
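The pattern_cleanup_ownership.py transformer now parses each owner URN instead of assuming the urn:li:corpuser: prefix, cleans both user and group owners, and passes unsupported or unparseable owners through unchanged (it also switches the assigned ownership type from DATAOWNER to TECHNICAL_OWNER). A rough standalone sketch of that cleanup, using the classes the diff imports and assuming Urn.from_string returns the typed URN subclasses, as the isinstance checks imply; the regex pattern here is an example, not from the source:

    # Rough sketch of the new owner-cleanup logic, outside the transformer.
    import re

    from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
    from datahub.utilities.urns._urn_base import Urn

    pattern_for_cleanup = [r"@example\.com"]  # example pattern, not from the source

    def clean_owner(owner_urn: str) -> str:
        def strip(name: str) -> str:
            for value in pattern_for_cleanup:
                name = re.sub(value, "", name)
            return name

        owner = Urn.from_string(owner_urn)
        if isinstance(owner, CorpUserUrn):
            return str(CorpUserUrn(strip(owner.username)))
        if isinstance(owner, CorpGroupUrn):
            return str(CorpGroupUrn(strip(owner.name)))
        return owner_urn  # unsupported owner types pass through unchanged

    assert clean_owner("urn:li:corpuser:jdoe@example.com") == "urn:li:corpuser:jdoe"
    assert clean_owner("urn:li:corpGroup:data-eng@example.com") == "urn:li:corpGroup:data-eng"
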
datahub/lite/duckdb_lite.py
@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
datahub/lite/lite_local.py
@@ -90,7 +90,7 @@ class DataHubLiteLocal(Generic[LiteConfig], Closeable, metaclass=ABCMeta):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
         pass