acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/METADATA +2449 -2449
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/RECORD +72 -71
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/graph/client.py +15 -6
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/csv_enricher.py +2 -2
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/lookml_source.py +1 -1
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +8 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +2 -1
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +517 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17362 -17638
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_entity.py +18 -1
- datahub/sdk/container.py +3 -1
- datahub/sdk/dataset.py +5 -3
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi_report_server/report_server.py CHANGED

@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
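The report_server conversion above is the template repeated in the redash, salesforce, and slack diffs further down: the config class mixes in StatefulIngestionConfigBase, the report extends StaleEntityRemovalSourceReport, the source extends StatefulIngestionSourceBase (whose __init__ takes config and ctx rather than ctx alone), and get_workunit_processors registers the stale-entity-removal handler so entities missing from the current run can be soft-deleted. A condensed sketch of those four touchpoints, with hypothetical MySource/MySourceConfig/MySourceReport standing in for the real classes:

    # Condensed sketch of the stateful-ingestion wiring shown in these diffs.
    # MySource, MySourceConfig, and MySourceReport are hypothetical stand-ins.
    from dataclasses import dataclass
    from typing import Iterable, List, Optional

    from datahub.ingestion.api.common import PipelineContext
    from datahub.ingestion.api.source import MetadataWorkUnitProcessor
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.state.stale_entity_removal_handler import (
        StaleEntityRemovalHandler,
        StaleEntityRemovalSourceReport,
    )
    from datahub.ingestion.source.state.stateful_ingestion_base import (
        StatefulIngestionConfigBase,
        StatefulIngestionSourceBase,
    )


    class MySourceConfig(StatefulIngestionConfigBase):
        # 1. Config inherits StatefulIngestionConfigBase so recipes accept a
        #    `stateful_ingestion` block.
        pass


    @dataclass
    class MySourceReport(StaleEntityRemovalSourceReport):
        # 2. Report inherits StaleEntityRemovalSourceReport so soft-delete
        #    counts appear in the run summary.
        pass


    class MySource(StatefulIngestionSourceBase):
        def __init__(self, config: MySourceConfig, ctx: PipelineContext):
            # 3. The stateful base class takes (config, ctx), not just ctx.
            super().__init__(config, ctx)
            self.config = config
            self.ctx = ctx
            self.report = MySourceReport()

        def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
            # 4. The handler's processor compares this run's entities against
            #    the previous checkpoint and soft-deletes anything that vanished.
            return [
                *super().get_workunit_processors(),
                StaleEntityRemovalHandler.create(
                    self, self.config, self.ctx
                ).workunit_processor,
            ]

        def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
            yield from []  # source-specific extraction goes here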
datahub/ingestion/source/pulsar.py CHANGED

@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 
@@ -229,8 +230,8 @@ class PulsarSource(StatefulIngestionSourceBase):
             self.report.report_warning("HTTPError", message)
         except requests.exceptions.RequestException as e:
             raise Exception(
-                "An ambiguous exception occurred while handling the request"
-            )
+                "An ambiguous exception occurred while handling the request"
+            ) from e
 
     @classmethod
     def create(cls, config_dict, ctx):
datahub/ingestion/source/redash.py CHANGED

@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
     return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -724,6 +739,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
datahub/ingestion/source/s3/config.py CHANGED

@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
datahub/ingestion/source/salesforce.py CHANGED

@@ -17,7 +17,9 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
 )
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
@@ -29,9 +31,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -85,7 +95,10 @@ class SalesforceProfilingConfig(ConfigModel):
     # TODO - support field level profiling
 
 
-class SalesforceConfig(DatasetSourceConfigMixin):
+class SalesforceConfig(
+    StatefulIngestionConfigBase,
+    DatasetSourceConfigMixin,
+):
     platform: str = "salesforce"
 
     auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD
@@ -149,7 +162,7 @@ class SalesforceConfig(DatasetSourceConfigMixin):
 
 
 @dataclass
-class SalesforceSourceReport(SourceReport):
+class SalesforceSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -214,7 +227,7 @@ FIELD_TYPE_MAPPING = {
     capability_name=SourceCapability.TAGS,
     description="Enabled by default",
 )
-class SalesforceSource(Source):
+class SalesforceSource(StatefulIngestionSourceBase):
     base_url: str
     config: SalesforceConfig
     report: SalesforceSourceReport
@@ -223,7 +236,8 @@ class SalesforceSource(Source):
     fieldCounts: Dict[str, int]
 
     def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.config = config
         self.report = SalesforceSourceReport()
         self.session = requests.Session()
@@ -328,6 +342,14 @@ class SalesforceSource(Source):
             )
         )
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             sObjects = self.get_salesforce_objects()
datahub/ingestion/source/sigma/sigma.py CHANGED

@@ -124,7 +124,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         try:
             self.sigma_api = SigmaAPI(self.config, self.reporter)
         except Exception as e:
-            raise ConfigurationError("Unable to connect sigma API")
+            raise ConfigurationError("Unable to connect sigma API") from e
 
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
datahub/ingestion/source/slack/slack.py CHANGED

@@ -9,7 +9,6 @@ from tenacity import retry, wait_exponential
 from tenacity.before_sleep import before_sleep_log
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -18,8 +17,19 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     CorpUserEditableInfoClass,
     DatasetPropertiesClass,
@@ -44,7 +54,9 @@ class CorpUser:
     slack_display_name: Optional[str] = None
 
 
-class SlackSourceConfig(ConfigModel):
+class SlackSourceConfig(
+    StatefulIngestionConfigBase,
+):
     bot_token: SecretStr = Field(
         description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
     )
@@ -58,22 +70,22 @@ class SlackSourceConfig(ConfigModel):
         default=10,
         description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    ingest_public_channels = Field(
+    ingest_public_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
     )
-    channels_iteration_limit = Field(
+    channels_iteration_limit: int = Field(
         type=int,
         default=200,
         description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
     )
-    channel_min_members = Field(
+    channel_min_members: int = Field(
         type=int,
         default=2,
         description="Ingest channels with at least this many members.",
     )
-    should_ingest_archived_channels = Field(
+    should_ingest_archived_channels: bool = Field(
         type=bool,
         default=False,
         description="Whether to ingest archived channels.",
@@ -81,7 +93,7 @@ class SlackSourceConfig(ConfigModel):
 
 
 @dataclass
-class SlackSourceReport(SourceReport):
+class SlackSourceReport(StaleEntityRemovalSourceReport):
     channels_reported: int = 0
     archived_channels_reported: int = 0
 
@@ -92,11 +104,12 @@ PLATFORM_NAME = "slack"
 @platform_name("Slack")
 @config_class(SlackSourceConfig)
 @support_status(SupportStatus.TESTING)
-class SlackSource(Source):
+class SlackSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
-        self.report = SlackSourceReport()
+        self.report: SlackSourceReport = SlackSourceReport()
         self.workspace_base_url: Optional[str] = None
         self.rate_limiter = RateLimiter(
             max_calls=self.config.api_requests_per_min, period=60
@@ -111,6 +124,14 @@ class SlackSource(Source):
     def get_slack_client(self) -> WebClient:
         return WebClient(token=self.config.bot_token.get_secret_value())
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
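The annotation additions on the Slack config fields are more than cosmetic. Pydantic derives a field's type from the annotation, not from keyword arguments; `type=bool` is not a recognized parameter of `Field()` and is only stored as extra metadata, so the explicit `: bool` / `: int` annotations are what actually drive validation and coercion. A minimal sketch, using a hypothetical SlackConfigSketch model:

    from pydantic import BaseModel, Field


    class SlackConfigSketch(BaseModel):
        # The annotation drives validation and coercion; `type=bool` inside
        # Field(...) would be inert extra metadata.
        ingest_public_channels: bool = Field(
            default=False,
            description="Whether to ingest public channels.",
        )
        channel_min_members: int = Field(
            default=2,
            description="Ingest channels with at least this many members.",
        )


    # String inputs coerce cleanly because the annotated types drive parsing:
    cfg = SlackConfigSketch.parse_obj(
        {"ingest_public_channels": "true", "channel_min_members": "2"}
    )
    assert cfg.ingest_public_channels is True and cfg.channel_min_members == 2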
datahub/ingestion/source/snowflake/snowflake_connection.py CHANGED

@@ -312,7 +312,7 @@ class SnowflakeConnectionConfig(ConfigModel):
             raise ValueError(
                 f"access_token not found in response {response}. "
                 "Please check your OAuth configuration."
-            )
+            ) from None
         connect_args = self.get_options()["connect_args"]
         return snowflake.connector.connect(
             user=self.username,
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED

@@ -134,10 +134,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE'
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -156,10 +157,11 @@ class SnowflakeQuery:
             clustering_key AS "CLUSTERING_KEY",
             auto_clustering_on AS "AUTO_CLUSTERING_ON",
             is_dynamic AS "IS_DYNAMIC",
-            is_iceberg AS "IS_ICEBERG"
+            is_iceberg AS "IS_ICEBERG",
+            is_hybrid AS "IS_HYBRID"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE'
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
datahub/ingestion/source/snowflake/snowflake_schema.py CHANGED

@@ -96,10 +96,7 @@ class SnowflakeTable(BaseTable):
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
     is_dynamic: bool = False
     is_iceberg: bool = False
-
-    @property
-    def is_hybrid(self) -> bool:
-        return self.type is not None and self.type == "HYBRID TABLE"
+    is_hybrid: bool = False
 
     def get_subtype(self) -> DatasetSubTypes:
         return DatasetSubTypes.TABLE
@@ -369,6 +366,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -395,6 +393,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     clustering_key=table["CLUSTERING_KEY"],
                     is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
                     is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
+                    is_hybrid=table.get("IS_HYBRID", "NO").upper() == "YES",
                 )
             )
         return tables
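information_schema reports these table flags as 'YES'/'NO' strings, so each one is normalized with `.get(..., "NO").upper() == "YES"`, with a missing column defaulting to 'NO'; `is_hybrid` now follows the same path as `is_dynamic` and `is_iceberg` instead of being derived from the table type. A minimal sketch of that row-to-flags mapping, with a hypothetical row dict:

    from dataclasses import dataclass


    @dataclass
    class TableFlags:
        is_dynamic: bool = False
        is_iceberg: bool = False
        is_hybrid: bool = False


    def flags_from_row(row: dict) -> TableFlags:
        # information_schema returns 'YES'/'NO' strings; a column that is
        # absent from the row defaults to 'NO'.
        def yes(col: str) -> bool:
            return row.get(col, "NO").upper() == "YES"

        return TableFlags(
            is_dynamic=yes("IS_DYNAMIC"),
            is_iceberg=yes("IS_ICEBERG"),
            is_hybrid=yes("IS_HYBRID"),
        )


    assert flags_from_row({"IS_HYBRID": "YES"}).is_hybrid is True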
datahub/ingestion/source/sql/druid.py CHANGED

@@ -50,11 +50,7 @@ class DruidConfig(BasicSQLAlchemyConfig):
     """
 
     def get_identifier(self, schema: str, table: str) -> str:
-        return (
-            f"{self.platform_instance}.{table}"
-            if self.platform_instance
-            else f"{table}"
-        )
+        return f"{table}"
 
 
 @platform_name("Druid")
datahub/ingestion/source/sql/oracle.py CHANGED

@@ -1,5 +1,6 @@
 import datetime
 import logging
+import platform
 import re
 
 # This import verifies that the dependencies are available.
@@ -85,6 +86,16 @@ class OracleConfig(BasicSQLAlchemyConfig):
         description="The data dictionary views mode, to extract information about schema objects "
         "('ALL' and 'DBA' views are supported). (https://docs.oracle.com/cd/E11882_01/nav/catalog_views.htm)",
     )
+    # oracledb settings to enable thick mode and client library location
+    enable_thick_mode: Optional[bool] = Field(
+        default=False,
+        description="Connection defaults to thin mode. Set to True to enable thick mode.",
+    )
+    thick_mode_lib_dir: Optional[str] = Field(
+        default=None,
+        description="If using thick mode on Windows or Mac, set thick_mode_lib_dir to the oracle client libraries path. "
+        "On Linux, this value is ignored, as ldconfig or LD_LIBRARY_PATH will define the location.",
+    )
 
     @pydantic.validator("service_name")
     def check_service_name(cls, v, values):
@@ -100,6 +111,18 @@ class OracleConfig(BasicSQLAlchemyConfig):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
         return values
 
+    @pydantic.validator("thick_mode_lib_dir", always=True)
+    def check_thick_mode_lib_dir(cls, v, values):
+        if (
+            v is None
+            and values.get("enable_thick_mode")
+            and (platform.system() == "Darwin" or platform.system() == "Windows")
+        ):
+            raise ValueError(
+                "Specify 'thick_mode_lib_dir' on Mac/Windows when enable_thick_mode is true"
+            )
+        return v
+
     def get_sql_alchemy_url(self):
         url = super().get_sql_alchemy_url()
         if self.service_name:
@@ -586,6 +609,17 @@ class OracleSource(SQLAlchemySource):
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "oracle")
 
+        # if connecting to oracle with enable_thick_mode, it must be initialized before calling
+        # create_engine, which is called in get_inspectors()
+        # https://python-oracledb.readthedocs.io/en/latest/user_guide/initialization.html#enabling-python-oracledb-thick-mode
+        if self.config.enable_thick_mode:
+            if platform.system() == "Darwin" or platform.system() == "Windows":
+                # windows and mac os require lib_dir to be set explicitly
+                oracledb.init_oracle_client(lib_dir=self.config.thick_mode_lib_dir)
+            else:
+                # linux requires configurating the library path with ldconfig or LD_LIBRARY_PATH
+                oracledb.init_oracle_client()
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = OracleConfig.parse_obj(config_dict)
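python-oracledb starts in thin mode; `oracledb.init_oracle_client()` switches the whole process to thick mode and must run before the first connection is created (hence before SQLAlchemy's `create_engine`), which is why the source calls it in `__init__`. A minimal sketch of that ordering constraint, assuming Oracle client libraries are installed; the lib_dir path is hypothetical:

    import platform

    import oracledb
    from sqlalchemy import create_engine

    # Thick mode must be enabled before any connection is opened; once a
    # thin-mode connection exists, init_oracle_client() fails for the process.
    if platform.system() in ("Darwin", "Windows"):
        # Mac/Windows need an explicit client library path (hypothetical path).
        oracledb.init_oracle_client(lib_dir="/opt/oracle/instantclient_23_5")
    else:
        # Linux resolves the libraries via ldconfig or LD_LIBRARY_PATH.
        oracledb.init_oracle_client()

    # Only now create the engine; connections it opens will use thick mode.
    engine = create_engine("oracle+oracledb://user:pass@host:1521/?service_name=orclpdb1")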
datahub/ingestion/source/tableau/tableau.py CHANGED

@@ -1562,8 +1562,9 @@ class TableauSiteSource:
         query: str,
         connection_type: str,
         page_size: int,
-        query_filter: dict = {},
+        query_filter: Optional[dict] = None,
     ) -> Iterable[dict]:
+        query_filter = query_filter or {}
         query_filter = optimize_query_filter(query_filter)
 
         # Calls the get_connection_object_page function to get the objects,
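This change, like `get_tags_from_params` below and duckdb_lite's `search` at the end of the diff, removes a mutable default argument. A default such as `query_filter: dict = {}` is evaluated once at function definition, so every call that omits the argument shares (and can accidentally mutate) the same dict; the `Optional[...] = None` plus `x = x or {}` idiom allocates a fresh object per call. A minimal demonstration:

    from typing import Optional


    def bad_append(item: int, acc: list = []) -> list:
        # The single shared default list keeps growing across calls.
        acc.append(item)
        return acc


    def good_append(item: int, acc: Optional[list] = None) -> list:
        acc = acc or []  # fresh list on every call that omits `acc`
        acc.append(item)
        return acc


    assert bad_append(1) == [1]
    assert bad_append(2) == [1, 2]  # surprise: state leaked between calls
    assert good_append(1) == [1]
    assert good_append(2) == [2]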
datahub/ingestion/source/tableau/tableau_common.py CHANGED

@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
 }
 
 
-def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+    params = params or []
     tags = [
         TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
         for tag in params
datahub/ingestion/source_config/pulsar.py CHANGED

@@ -33,7 +33,9 @@ def _is_valid_hostname(hostname: str) -> bool:
 
 
 class PulsarSourceConfig(
-    StatefulIngestionConfigBase,
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
 ):
     web_service_url: str = Field(
         default="http://localhost:8080", description="The web URL for the cluster."
datahub/ingestion/transformer/pattern_cleanup_ownership.py CHANGED

@@ -1,3 +1,4 @@
+import logging
 import re
 from typing import List, Optional, Set, cast
 
@@ -10,8 +11,11 @@ from datahub.metadata.schema_classes import (
     OwnershipClass,
     OwnershipTypeClass,
 )
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError
 
-
+logger = logging.getLogger(__name__)
 
 
 class PatternCleanUpOwnershipConfig(ConfigModel):
@@ -49,6 +53,11 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         else:
             return set()
 
+    def _process_owner(self, name: str) -> str:
+        for value in self.config.pattern_for_cleanup:
+            name = re.sub(value, "", name)
+        return name
+
     def transform_aspect(
         self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect]
     ) -> Optional[builder.Aspect]:
@@ -58,14 +67,23 @@ class PatternCleanUpOwnership(OwnershipTransformer):
         # clean all the owners based on the parameters received from config
         cleaned_owner_urns: List[str] = []
         for owner_urn in current_owner_urns:
-
-
-
-
-
+            username = ""
+            try:
+                owner: Urn = Urn.from_string(owner_urn)
+                if isinstance(owner, CorpUserUrn):
+                    username = str(CorpUserUrn(self._process_owner(owner.username)))
+                elif isinstance(owner, CorpGroupUrn):
+                    username = str(CorpGroupUrn(self._process_owner(owner.name)))
+                else:
+                    logger.warning(f"{owner_urn} is not a supported owner type.")
+                    username = owner_urn
+            except InvalidUrnError:
+                logger.warning(f"Could not parse {owner_urn} from {entity_urn}")
+                username = owner_urn
+            cleaned_owner_urns.append(username)
 
         ownership_type, ownership_type_urn = builder.validate_ownership_type(
-            OwnershipTypeClass.
+            OwnershipTypeClass.TECHNICAL_OWNER
         )
         owners = [
             OwnerClass(
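The reworked transformer round-trips owner URNs through typed URN classes instead of string surgery: the cleanup regexes are applied only to the user or group id, and unparseable or unsupported URNs pass through unchanged with a warning. A small illustration of the intended behavior, assuming a hypothetical `pattern_for_cleanup` of `["_dev$"]`:

    import re

    from datahub.metadata.urns import CorpUserUrn
    from datahub.utilities.urns._urn_base import Urn

    pattern_for_cleanup = ["_dev$"]  # hypothetical config value


    def process_owner(name: str) -> str:
        # Mirrors the new _process_owner: apply each cleanup regex in turn.
        for value in pattern_for_cleanup:
            name = re.sub(value, "", name)
        return name


    owner = Urn.from_string("urn:li:corpuser:alice_dev")
    assert isinstance(owner, CorpUserUrn)
    # The regex touches only the username, and the cleaned id is re-wrapped
    # in a typed URN rather than edited in place as a raw string:
    assert str(CorpUserUrn(process_owner(owner.username))) == "urn:li:corpuser:alice"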
datahub/lite/duckdb_lite.py CHANGED

@@ -284,9 +284,10 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
         self,
         query: str,
         flavor: SearchFlavor,
-        aspects: List[str] = [],
+        aspects: Optional[List[str]] = None,
         snippet: bool = True,
     ) -> Iterable[Searchable]:
+        aspects = aspects or []
         if flavor == SearchFlavor.FREE_TEXT:
             base_query = f"SELECT distinct(urn), 'urn', NULL from metadata_aspect_v2 where urn ILIKE '%{query}%' UNION SELECT urn, aspect_name, metadata from metadata_aspect_v2 where metadata->>'$.name' ILIKE '%{query}%'"
             for r in self.duckdb_client.execute(base_query).fetchall():
datahub/lite/lite_local.py CHANGED