acryl-datahub 0.15.0.4rc3__py3-none-any.whl → 0.15.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2507 -2470
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +95 -86
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
- datahub/__init__.py +1 -25
- datahub/_version.py +13 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
- datahub/cli/check_cli.py +1 -1
- datahub/cli/cli_utils.py +3 -3
- datahub/cli/container_cli.py +1 -64
- datahub/cli/iceberg_cli.py +707 -0
- datahub/cli/ingest_cli.py +2 -2
- datahub/emitter/composite_emitter.py +36 -0
- datahub/emitter/rest_emitter.py +1 -1
- datahub/entrypoints.py +26 -5
- datahub/ingestion/api/incremental_lineage_helper.py +4 -0
- datahub/ingestion/api/registry.py +1 -1
- datahub/ingestion/glossary/classification_mixin.py +6 -0
- datahub/ingestion/glossary/classifier.py +3 -2
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/run/connection.py +1 -1
- datahub/ingestion/run/pipeline.py +3 -3
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/apply/__init__.py +0 -0
- datahub/ingestion/source/apply/datahub_apply.py +223 -0
- datahub/ingestion/source/aws/glue.py +5 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/dbt/dbt_core.py +1 -1
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/azure_ad.py +6 -14
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/looker_config.py +3 -1
- datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
- datahub/ingestion/source/looker/looker_file_loader.py +14 -3
- datahub/ingestion/source/looker/looker_template_language.py +104 -14
- datahub/ingestion/source/looker/lookml_config.py +29 -8
- datahub/ingestion/source/looker/lookml_source.py +110 -22
- datahub/ingestion/source/mode.py +2 -4
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
- datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
- datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
- datahub/ingestion/source/sql/clickhouse.py +5 -43
- datahub/ingestion/source/sql/mssql/job_models.py +37 -8
- datahub/ingestion/source/sql/mssql/source.py +17 -0
- datahub/ingestion/source/sql/sql_config.py +0 -10
- datahub/ingestion/source/tableau/tableau.py +16 -13
- datahub/ingestion/source/tableau/tableau_common.py +1 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/proxy.py +2 -2
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_config/operation_config.py +9 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- datahub/metadata/_schema_classes.py +304 -6
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
- datahub/metadata/schema.avsc +211 -12
- datahub/metadata/schemas/AssertionInfo.avsc +2 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
- datahub/metadata/schemas/DashboardInfo.avsc +5 -5
- datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +12 -0
- datahub/metadata/schemas/DisplayProperties.avsc +62 -0
- datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
- datahub/metadata/schemas/PostInfo.avsc +28 -2
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/specific/dashboard.py +43 -1
- datahub/telemetry/telemetry.py +4 -4
- datahub/testing/check_imports.py +28 -0
- datahub/upgrade/upgrade.py +17 -9
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.4rc3.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/apply/datahub_apply.py
ADDED

@@ -0,0 +1,223 @@
+import logging
+from functools import partial
+from typing import Any, Iterable, List, Optional, Union
+
+import progressbar
+from pydantic import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.metadata.schema_classes import (
+    DomainsClass,
+    GlossaryTermAssociationClass,
+    MetadataChangeProposalClass,
+    OwnerClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+)
+from datahub.specific.dataset import DatasetPatchBuilder
+
+logger = logging.getLogger(__name__)
+
+
+def apply_association_to_container(
+    container_urn: str,
+    association_urn: str,
+    association_type: str,
+    emit: bool = True,
+    graph: Optional[DataHubGraph] = None,
+) -> Optional[List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]]:
+    """
+    Common function to add either tags, terms, domains, or owners to child datasets (for now).
+
+    Args:
+        container_urn: The URN of the container
+        association_urn: The URN of the tag, term, or user to apply
+        association_type: One of 'tag', 'term', 'domain' or 'owner'
+    """
+    urns: List[str] = [container_urn]
+    if not graph:
+        graph = get_default_graph()
+    logger.info(f"Using {graph}")
+    urns.extend(
+        graph.get_urns_by_filter(
+            container=container_urn,
+            batch_size=1000,
+            entity_types=["dataset", "container"],
+        )
+    )
+
+    all_patches: List[Any] = []
+    for urn in urns:
+        builder = DatasetPatchBuilder(urn)
+        patches: List[Any] = []
+        if association_type == "tag":
+            patches = builder.add_tag(TagAssociationClass(association_urn)).build()
+        elif association_type == "term":
+            patches = builder.add_term(
+                GlossaryTermAssociationClass(association_urn)
+            ).build()
+        elif association_type == "owner":
+            patches = builder.add_owner(
+                OwnerClass(
+                    owner=association_urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+            ).build()
+        elif association_type == "domain":
+            patches = [
+                MetadataChangeProposalWrapper(
+                    entityUrn=urn,
+                    aspect=DomainsClass(domains=[association_urn]),
+                )
+            ]
+        all_patches.extend(patches)
+    if emit:
+        mcps_iter = progressbar.progressbar(all_patches, redirect_stdout=True)
+        for mcp in mcps_iter:
+            graph.emit(mcp)
+        return None
+    else:
+        return all_patches
+
+
+class DomainApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply domain hierarchichaly. Currently only containers and datasets are supported",
+    )
+    domain_urn: str = Field(default="")
+
+
+class TagApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply tag hierarchichaly. Currently only containers and datasets are supported",
+    )
+    tag_urn: str = Field(default="")
+
+
+class TermApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply term hierarchichaly. Currently only containers and datasets are supported",
+    )
+    term_urn: str = Field(default="")
+
+
+class OwnerApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply owner hierarchichaly. Currently only containers and datasets are supported",
+    )
+    owner_urn: str = Field(default="")
+
+
+class DataHubApplyConfig(ConfigModel):
+    domain_apply: Optional[List[DomainApplyConfig]] = Field(
+        default=None,
+        description="List to apply domains to assets",
+    )
+    tag_apply: Optional[List[TagApplyConfig]] = Field(
+        default=None,
+        description="List to apply tags to assets",
+    )
+    term_apply: Optional[List[TermApplyConfig]] = Field(
+        default=None,
+        description="List to apply terms to assets",
+    )
+    owner_apply: Optional[List[OwnerApplyConfig]] = Field(
+        default=None,
+        description="List to apply owners to assets",
+    )
+
+
+@platform_name("DataHubApply")
+@config_class(DataHubApplyConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubApplySource(Source):
+    """
+    This source is a helper over CLI
+    so people can use the helper to apply various metadata changes to DataHub
+    via Managed Ingestion
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubApplyConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = SourceReport()
+        self.graph = ctx.require_graph()
+
+    def _yield_workunits(
+        self,
+        proposals: List[
+            Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]
+        ],
+    ) -> Iterable[MetadataWorkUnit]:
+        for proposal in proposals:
+            if isinstance(proposal, MetadataChangeProposalWrapper):
+                yield proposal.as_workunit()
+            else:
+                yield MetadataWorkUnit(
+                    id=MetadataWorkUnit.generate_workunit_id(proposal),
+                    mcp_raw=proposal,
+                )
+
+    def _handle_assets(
+        self, assets: List[str], apply_urn: str, apply_type: str
+    ) -> Iterable[MetadataWorkUnit]:
+        for asset in assets:
+            change_proposals = apply_association_to_container(
+                asset, apply_urn, apply_type, emit=False, graph=self.graph
+            )
+            assert change_proposals is not None
+            yield from self._yield_workunits(change_proposals)
+
+    def _yield_domain(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.domain_apply:
+            return
+        for apply in self.config.domain_apply:
+            yield from self._handle_assets(apply.assets, apply.domain_urn, "domain")
+
+    def _yield_tag(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.tag_apply:
+            return
+        for apply in self.config.tag_apply:
+            yield from self._handle_assets(apply.assets, apply.tag_urn, "tag")
+
+    def _yield_term(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.term_apply:
+            return
+        for apply in self.config.term_apply:
+            yield from self._handle_assets(apply.assets, apply.term_urn, "term")
+
+    def _yield_owner(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.owner_apply:
+            return
+        for apply in self.config.owner_apply:
+            yield from self._handle_assets(apply.assets, apply.owner_urn, "owner")
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[MetadataWorkUnit]:
+        yield from self._yield_domain()
+        yield from self._yield_tag()
+        yield from self._yield_term()
+        yield from self._yield_owner()
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [partial(auto_workunit_reporter, self.get_report())]
+
+    def get_report(self) -> SourceReport:
+        return self.report
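
A hedged usage sketch for the helper added above: the URNs are placeholders, and get_default_graph() requires a configured DataHub connection (for example via `datahub init`). With emit=True the patches are emitted directly; with emit=False they are returned instead, which is how DataHubApplySource builds its work units.

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

graph = get_default_graph()

# Tag a container and every dataset/container beneath it.
apply_association_to_container(
    container_urn="urn:li:container:<container-guid>",  # placeholder URN
    association_urn="urn:li:tag:PII",                   # placeholder URN
    association_type="tag",
    emit=True,
    graph=graph,
)
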
datahub/ingestion/source/aws/glue.py
CHANGED

@@ -113,6 +113,7 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -218,8 +219,9 @@ class GlueSourceConfig(

 @dataclass
 class GlueSourceReport(StaleEntityRemovalSourceReport):
+    catalog_id: Optional[str] = None
     tables_scanned = 0
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
     databases: EntityFilterReport = EntityFilterReport.field(type="database")

     num_job_script_location_missing: int = 0

@@ -315,6 +317,7 @@ class GlueSource(StatefulIngestionSourceBase):
         self.extract_owners = config.extract_owners
         self.source_config = config
         self.report = GlueSourceReport()
+        self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
         self.extract_transforms = config.extract_transforms

@@ -744,7 +747,7 @@ class GlueSource(StatefulIngestionSourceBase):
                 for tables in self.get_tables_from_database(database):
                     all_tables.append(tables)
             except Exception as e:
-                self.report.
+                self.report.warning(
                     message="Failed to get tables from database",
                     context=database["Name"],
                     exc=e,

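The `filtered` change above recurs throughout this release: report fields switch from plain lists to LossyList[str], so reports on large deployments keep a bounded sample of dropped names instead of all of them. A minimal sketch of the pattern (the report class is illustrative; only LossyList is from DataHub, and its exact truncation behavior is assumed rather than shown):

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:
    # Same shape as the report fields changed in these hunks.
    filtered: LossyList[str] = field(default_factory=LossyList)

    def report_dropped(self, name: str) -> None:
        self.filtered.append(name)


report = ExampleSourceReport()
for i in range(100_000):
    report.report_dropped(f"table_{i}")

# Expected to render a bounded sample of the appended names rather than
# the full 100k entries (LossyList's truncation behavior, assumed here).
print(report.filtered)
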
datahub/ingestion/source/aws/sagemaker_processors/common.py
CHANGED

@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Dict,
+from typing import Dict, Optional, Union

 from pydantic.fields import Field

@@ -9,6 +9,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulIngestionConfigBase,
     StatefulStaleMetadataRemovalConfig,
 )
+from datahub.utilities.lossy_collections import LossyList


 class SagemakerSourceConfig(

@@ -42,7 +43,7 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     jobs_scanned = 0
     jobs_processed = 0
     datasets_scanned = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
     model_endpoint_lineage = 0
     model_group_lineage = 0

datahub/ingestion/source/bigquery_v2/bigquery_report.py
CHANGED

@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled:
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None

datahub/ingestion/source/dbt/dbt_core.py
CHANGED

@@ -488,7 +488,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
     ) -> Dict:
         if re.match("^https?://", uri):
             return json.loads(requests.get(uri).text)
-        elif
+        elif is_s3_uri(uri):
             u = urlparse(uri)
             assert aws_connection
             response = aws_connection.get_s3_client().get_object(

datahub/ingestion/source/delta_lake/report.py
CHANGED

@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List

 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_file_scanned(self) -> None:
         self.files_scanned += 1

datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED

@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry

 MAX_ITEMS_TO_RETRIEVE = 100

@@ -120,7 +121,7 @@ class DynamoDBConfig(

 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/elastic_search.py
CHANGED

@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 logger = logging.getLogger(__name__)

@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1

datahub/ingestion/source/ge_profiling_config.py
CHANGED

@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )

     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days.
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )

     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `
-        "Supported for `
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )

     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count.
-        "no limit on the row count of tables to profile. Supported only in
-        "Supported for `
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )

     profile_table_row_count_estimate_only: bool = Field(

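The reworded descriptions above document four profiling guardrails. A minimal sketch of setting them programmatically (assumes GEProfilingConfig can be instantiated directly with defaults for everything else; in an ingestion recipe these keys live under the source's `profiling` block):

from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

profiling = GEProfilingConfig(
    enabled=True,
    max_number_of_fields_to_profile=50,   # cap on columns profiled per table
    profile_if_updated_since_days=7,      # skip tables not updated recently (Snowflake/BigQuery)
    profile_table_size_limit=5,           # GB; None removes the size limit
    profile_table_row_limit=5_000_000,    # rows; None removes the row-count limit
)
print(profiling.profile_table_size_limit)
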
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict,
+from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator

@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)

@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)

datahub/ingestion/source/identity/azure_ad.py
CHANGED

@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry

 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext

@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )

-
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(

@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
-        self.
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)


 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API

@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]

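The `filtered_tracking` option is dropped here, but old recipes that still set it should keep validating, because pydantic_removed_field registers a validator that discards the key before the model is parsed. A minimal sketch of that pattern with a hypothetical config class (only ConfigModel and the helper are from DataHub; the field names are illustrative and the discard-with-warning behavior is assumed):

from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_removal import pydantic_removed_field


class ExampleConfig(ConfigModel):
    keep_me: bool = True

    # Accept and discard `drop_me`, so recipes written for older versions
    # do not fail validation after the field is removed.
    _remove_drop_me = pydantic_removed_field("drop_me")


config = ExampleConfig.parse_obj({"keep_me": False, "drop_me": True})
assert config.keep_me is False  # `drop_me` was discarded during validation
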
datahub/ingestion/source/identity/okta.py
CHANGED

@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 nest_asyncio.apply()

@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):

 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/kafka/kafka.py
CHANGED

@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum

@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1

datahub/ingestion/source/kafka_connect/common.py
CHANGED

@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1

datahub/ingestion/source/ldap.py
CHANGED

@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}

@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns:
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)

     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)

datahub/ingestion/source/looker/looker_config.py
CHANGED

@@ -177,7 +177,9 @@ def _get_generic_definition(
 class LookerConnectionDefinition(ConfigModel):
     platform: str
     default_db: str
-    default_schema: Optional[str]
+    default_schema: Optional[str] = (
+        None  # Optional since some sources are two-level only
+    )
     platform_instance: Optional[str] = None
     platform_env: Optional[str] = Field(
         default=None,

datahub/ingestion/source/looker/looker_dataclasses.py
CHANGED

@@ -32,6 +32,12 @@ class LookerField:
     sql: Optional[str]


+@dataclass
+class LookerConstant:
+    name: str
+    value: str
+
+
 @dataclass
 class LookerModel:
     connection: str

@@ -75,6 +81,7 @@ class LookerModel:
         try:
             parsed = load_and_preprocess_file(
                 path=included_file,
+                reporter=reporter,
                 source_config=source_config,
             )
             included_explores = parsed.get("explores", [])

@@ -217,6 +224,7 @@ class LookerModel:
         try:
             parsed = load_and_preprocess_file(
                 path=included_file,
+                reporter=reporter,
                 source_config=source_config,
             )
             seen_so_far.add(included_file)

datahub/ingestion/source/looker/looker_file_loader.py
CHANGED

@@ -4,7 +4,10 @@ from dataclasses import replace
 from typing import Dict, Optional

 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
-from datahub.ingestion.source.looker.looker_dataclasses import
+from datahub.ingestion.source.looker.looker_dataclasses import (
+    LookerConstant,
+    LookerViewFile,
+)
 from datahub.ingestion.source.looker.looker_template_language import (
     load_and_preprocess_file,
 )

@@ -30,12 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
+        manifest_constants: Dict[str, LookerConstant] = {},
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
+        self.manifest_constants = manifest_constants

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport

@@ -60,7 +65,7 @@ class LookerViewFileLoader:
             with open(path) as file:
                 raw_file_content = file.read()
         except Exception as e:
-            self.reporter.
+            self.reporter.report_warning(
                 title="LKML File Loading Error",
                 message="A lookml file is not present on local storage or GitHub",
                 context=f"file path: {path}",

@@ -71,9 +76,15 @@ class LookerViewFileLoader:
         try:
             logger.debug(f"Loading viewfile {path}")

+            # load_and preprocess_file is called multiple times for loading view file from multiple flows.
+            # Flag resolve_constants is a hack to avoid passing around manifest_constants from all of the flows.
+            # This is fine as rest of flows do not need resolution of constants.
             parsed = load_and_preprocess_file(
                 path=path,
+                reporter=self.reporter,
                 source_config=self.source_config,
+                resolve_constants=True,
+                manifest_constants=self.manifest_constants,
             )

             looker_viewfile = LookerViewFile.from_looker_dict(

@@ -90,7 +101,7 @@ class LookerViewFileLoader:
             self.viewfile_cache[path] = looker_viewfile
             return looker_viewfile
         except Exception as e:
-            self.reporter.
+            self.reporter.report_warning(
                 title="LKML File Parsing Error",
                 message="The input file is not lookml file",
                 context=f"file path: {path}",

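
The new manifest_constants / resolve_constants plumbing above lets the loader substitute LookML @{constant} references with values declared in the project's manifest. An illustrative sketch of what that substitution amounts to (the regex here is a stand-in; the real resolution happens inside load_and_preprocess_file, and the LookML snippet is made up):

import re

from datahub.ingestion.source.looker.looker_dataclasses import LookerConstant

manifest_constants = {
    "schema_prefix": LookerConstant(name="schema_prefix", value="prod"),
}

raw_lkml = "sql_table_name: @{schema_prefix}_analytics.orders ;;"

# Replace each @{name} reference with the corresponding manifest constant value.
resolved = re.sub(
    r"@\{(\w+)\}",
    lambda match: manifest_constants[match.group(1)].value,
    raw_lkml,
)
print(resolved)  # sql_table_name: prod_analytics.orders ;;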