acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2609 -2608
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +74 -73
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +3 -3
- datahub/api/entities/external/external_tag.py +6 -4
- datahub/api/entities/external/lake_formation_external_entites.py +50 -49
- datahub/api/entities/external/restricted_text.py +105 -180
- datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
- datahub/api/entities/forms/forms.py +3 -3
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/quickstart_versioning.py +1 -1
- datahub/cli/specific/assertions_cli.py +37 -2
- datahub/cli/specific/datacontract_cli.py +54 -4
- datahub/emitter/rest_emitter.py +18 -5
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
- datahub/ingestion/api/report.py +21 -2
- datahub/ingestion/api/source.py +81 -7
- datahub/ingestion/autogenerated/capability_summary.json +47 -19
- datahub/ingestion/graph/client.py +19 -3
- datahub/ingestion/sink/datahub_rest.py +2 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +9 -0
- datahub/ingestion/source/aws/glue.py +18 -2
- datahub/ingestion/source/aws/tag_entities.py +4 -4
- datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
- datahub/ingestion/source/dbt/dbt_common.py +10 -0
- datahub/ingestion/source/delta_lake/source.py +8 -1
- datahub/ingestion/source/dremio/dremio_source.py +19 -2
- datahub/ingestion/source/fivetran/fivetran.py +9 -3
- datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
- datahub/ingestion/source/ge_data_profiler.py +8 -0
- datahub/ingestion/source/grafana/models.py +6 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/hex/query_fetcher.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +4 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
- datahub/ingestion/source/powerbi/powerbi.py +4 -1
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/salesforce.py +8 -0
- datahub/ingestion/source/slack/slack.py +7 -14
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
- datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
- datahub/ingestion/source/sql/hive_metastore.py +8 -0
- datahub/ingestion/source/sql/teradata.py +8 -1
- datahub/ingestion/source/sql/trino.py +9 -0
- datahub/ingestion/source/tableau/tableau.py +1 -1
- datahub/ingestion/source/unity/config.py +36 -1
- datahub/ingestion/source/unity/proxy.py +332 -46
- datahub/ingestion/source/unity/proxy_types.py +12 -2
- datahub/ingestion/source/unity/source.py +91 -34
- datahub/ingestion/source/unity/tag_entities.py +5 -5
- datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/metadata/_internal_schema_classes.py +513 -513
- datahub/metadata/_urns/urn_defs.py +1684 -1684
- datahub/metadata/schema.avsc +16745 -16348
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/sdk/entity_client.py +22 -7
- datahub/sdk/search_client.py +3 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataset.py +37 -59
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/server_config_util.py +2 -1
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
datahub/cli/quickstart_versioning.py
CHANGED

@@ -44,7 +44,7 @@ def get_minimum_supported_version_message(version: str) -> str:
 class QuickstartExecutionPlan(BaseModel):
     composefile_git_ref: str
     docker_tag: str
-    mysql_tag: Optional[str]
+    mysql_tag: Optional[str] = None
 
 
 def _is_it_a_version(version: str) -> bool:
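Note on the `mysql_tag` change: giving the `Optional[str]` field an explicit `= None` default makes it genuinely optional under Pydantic v2 semantics, where an un-defaulted `Optional` field is still required. A minimal sketch (the model below is illustrative, not the package's class):

```python
from typing import Optional
from pydantic import BaseModel

class Plan(BaseModel):
    composefile_git_ref: str
    docker_tag: str
    mysql_tag: Optional[str] = None  # may be omitted entirely, not just set to null

Plan(composefile_git_ref="master", docker_tag="head")  # validates without mysql_tag
```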
datahub/cli/specific/assertions_cli.py
CHANGED

@@ -1,3 +1,8 @@
+"""
+DEPRECATED: This assertions CLI is no longer supported and will be removed in a future version.
+Please use alternative methods for managing assertions in DataHub.
+"""
+
 import logging
 import os
 from pathlib import Path
@@ -26,7 +31,18 @@ REPORT_FILE_NAME = "compile_report.json"
 
 @click.group(cls=DefaultGroup, default="upsert")
 def assertions() -> None:
-    """A group of commands to interact with the Assertion entity in DataHub."""
+    """A group of commands to interact with the Assertion entity in DataHub.
+
+    ⚠️ DEPRECATED: This assertions CLI is no longer supported and will be removed
+    in a future version. Please use alternative methods for managing assertions in DataHub.
+    """
+    click.secho(
+        "⚠️ WARNING: The assertions CLI is deprecated and no longer supported. "
+        "It may be removed in a future version. Please use alternative methods for managing assertions in DataHub.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
     pass
 
 
@@ -34,7 +50,16 @@ def assertions() -> None:
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @upgrade.check_upgrade
 def upsert(file: str) -> None:
-    """Upsert (create or update) a set of assertions in DataHub."""
+    """Upsert (create or update) a set of assertions in DataHub.
+
+    ⚠️ DEPRECATED: This command is deprecated and no longer supported.
+    """
+    click.secho(
+        "⚠️ WARNING: The 'upsert' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
 
     assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file)
 
@@ -78,7 +103,15 @@ def compile(
     In future, we may introduce separate command to automatically apply these compiled changes
     in assertion platform. Currently, generated result artifacts are stored in target folder
     unless another folder is specified using option `--output-to <folder>`.
+
+    ⚠️ DEPRECATED: This command is deprecated and no longer supported.
     """
+    click.secho(
+        "⚠️ WARNING: The 'compile' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+        err=True,
+    )
 
     if platform not in ASSERTION_PLATFORMS:
         click.secho(
@@ -146,3 +179,5 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
 # Later:
 # 3. execute compiled assertions on assertion platform (Later, requires connection details to platform),
 # 4. cleanup assertions from assertion platform (generate artifacts. optionally execute)
+#
+# NOTE: This entire assertions CLI is deprecated and these TODOs will not be implemented.
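Note on the pattern above: the warning is emitted from the click group's callback, so it prints before whichever subcommand runs. A toy, self-contained sketch of the same idea (names are illustrative, not the package's commands):

```python
import click

@click.group()
def cli() -> None:
    # A group callback runs before the selected subcommand, so the banner always appears first.
    click.secho("WARNING: this CLI is deprecated.", fg="yellow", bold=True, err=True)

@cli.command()
def upsert() -> None:
    click.echo("running the actual command")

if __name__ == "__main__":
    cli()  # `python toy.py upsert` prints the banner to stderr, then runs upsert
```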
datahub/cli/specific/datacontract_cli.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+import warnings
 from typing import Optional
 
 import click
@@ -14,15 +15,52 @@ logger = logging.getLogger(__name__)
 
 @click.group(cls=DefaultGroup, default="upsert")
 def datacontract() -> None:
-    """
-
+    """
+    A group of commands to interact with the DataContract entity in DataHub.
+
+    WARNING: This CLI is DEPRECATED and no longer supported.
+    Please migrate to alternative data contract solutions.
+    """
+    # Issue deprecation warning
+    warnings.warn(
+        "The datacontract CLI is deprecated and no longer supported. "
+        "Please migrate to alternative data contract solutions.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+
+    # Log deprecation message for runtime visibility
+    logger.warning(
+        "DEPRECATED: The datacontract CLI is no longer supported and will be removed in a future version. "
+        "Please migrate to alternative data contract solutions."
+    )
+
+    # Display deprecation message to user
+    click.secho(
+        "⚠️ WARNING: This datacontract CLI is DEPRECATED and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+    click.secho("Please migrate to alternative data contract solutions.", fg="yellow")
 
 
 @datacontract.command()
 @click.option("-f", "--file", required=True, type=click.Path(exists=True))
 @upgrade.check_upgrade
 def upsert(file: str) -> None:
-    """
+    """
+    Upsert (create or update) a Data Contract in DataHub.
+
+    WARNING: This command is DEPRECATED and no longer supported.
+    """
+
+    click.secho(
+        "⚠️ WARNING: The 'upsert' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+
+    logger.warning("DEPRECATED: datacontract upsert command is no longer supported")
 
     data_contract: DataContract = DataContract.from_yaml(file)
     urn = data_contract.urn
@@ -59,7 +97,19 @@ def upsert(file: str) -> None:
 @click.option("--hard/--soft", required=False, is_flag=True, default=False)
 @upgrade.check_upgrade
 def delete(urn: Optional[str], file: Optional[str], hard: bool) -> None:
-    """
+    """
+    Delete a Data Contract in DataHub. Defaults to a soft-delete. Use --hard to completely erase metadata.
+
+    WARNING: This command is DEPRECATED and no longer supported.
+    """
+
+    click.secho(
+        "⚠️ WARNING: The 'delete' command is deprecated and no longer supported.",
+        fg="yellow",
+        bold=True,
+    )
+
+    logger.warning("DEPRECATED: datacontract delete command is no longer supported")
 
     if not urn:
         if not file:
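Note on the triple notification above: `warnings.warn(..., DeprecationWarning, stacklevel=2)` reaches programmatic callers and attributes the warning to their call site, while `logger.warning` and `click.secho` cover logs and the terminal. A small sketch of the `warnings` piece (the function name is illustrative):

```python
import warnings

def deprecated_entry_point() -> None:
    # stacklevel=2 attributes the warning to the caller, not to this line
    warnings.warn(
        "this entry point is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )

warnings.simplefilter("always", DeprecationWarning)  # DeprecationWarnings are often filtered out by default
deprecated_entry_point()
```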
datahub/emitter/rest_emitter.py
CHANGED
@@ -95,7 +95,7 @@ TRACE_INITIAL_BACKOFF = 1.0  # Start with 1 second
 TRACE_MAX_BACKOFF = 300.0  # Cap at 5 minutes
 TRACE_BACKOFF_FACTOR = 2.0  # Double the wait time each attempt
 
-# The limit is
+# The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
 INGEST_MAX_PAYLOAD_BYTES = int(
@@ -586,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
             "systemMetadata": system_metadata_obj,
         }
         payload = json.dumps(snapshot)
+        if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
+            logger.warning(
+                f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                "so this metadata will likely fail to be emitted."
+            )
 
         self._emit_generic(url, payload)
 
@@ -764,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
 
         mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
+        if len(mcp_objs) == 0:
+            return 0
 
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
-        mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size =
+        mcp_obj_chunks: List[List[str]] = [[]]
+        current_chunk_size = 0
         for mcp_obj in mcp_objs:
+            mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
             mcp_obj_size = len(json.dumps(mcp_obj))
             if _DATAHUB_EMITTER_TRACE:
                 logger.debug(
-                    f"Iterating through object with size {mcp_obj_size}
+                    f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
+                )
+            if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
+                logger.warning(
+                    f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
+                    "so this metadata will likely fail to be emitted."
                 )
 
             if (
@@ -786,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
-        if len(mcp_obj_chunks) >
+        if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
             logger.debug(
                 f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
            )
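The batching logic above packs serialized MCPs into payloads that stay under the GMS size cap. A self-contained sketch of the same greedy idea (names and the byte limit are illustrative, not the emitter's actual constants):

```python
import json
from typing import Any, Dict, List

MAX_PAYLOAD_BYTES = 15 * 1024 * 1024

def chunk_by_size(objs: List[Dict[str, Any]], limit: int = MAX_PAYLOAD_BYTES) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = [[]]   # start with one (possibly empty) chunk
    current = 0
    for obj in objs:
        size = len(json.dumps(obj))
        if chunks[-1] and current + size > limit:  # open a new chunk before overflowing
            chunks.append([])
            current = 0
        chunks[-1].append(obj)
        current += size
    return chunks
```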
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
CHANGED

@@ -90,7 +90,7 @@ class EnsureAspectSizeProcessor:
         on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
         """
         for wu in stream:
-            logger.debug(f"Ensuring size of workunit: {wu.id}")
+            # logger.debug(f"Ensuring size of workunit: {wu.id}")
 
             if schema := wu.get_aspect_of_type(SchemaMetadataClass):
                 self.ensure_schema_metadata_size(wu.get_urn(), schema)
datahub/ingestion/api/report.py
CHANGED
@@ -186,11 +186,19 @@ class ExamplesReport(Report, Closeable):
     aspects: Dict[str, Dict[str, int]] = field(
         default_factory=lambda: defaultdict(lambda: defaultdict(int))
     )
+    # This counts existence of aspects for each entity/subtype
+    # This is used for the UI to calculate %age of entities with the aspect
     aspects_by_subtypes: Dict[str, Dict[str, Dict[str, int]]] = field(
         default_factory=lambda: defaultdict(
             lambda: defaultdict(lambda: defaultdict(int))
         )
     )
+    # This counts all aspects for each entity/subtype
+    aspects_by_subtypes_full_count: Dict[str, Dict[str, Dict[str, int]]] = field(
+        default_factory=lambda: defaultdict(
+            lambda: defaultdict(lambda: defaultdict(int))
+        )
+    )
     samples: Dict[str, Dict[str, List[str]]] = field(
         default_factory=lambda: defaultdict(lambda: defaultdict(list))
     )
@@ -399,6 +407,9 @@ class ExamplesReport(Report, Closeable):
         entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
             defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
         )
+        entity_subtype_aspect_counts_exist: Dict[str, Dict[str, Dict[str, int]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        )
         for row in self._file_based_dict.sql_query(query):
             entity_type = row["entityType"]
             sub_type = row["subTypes"]
@@ -410,15 +421,23 @@ class ExamplesReport(Report, Closeable):
                 entity_subtype_aspect_counts[entity_type][sub_type][aspect] += (
                     aspect_count * count
                 )
+                entity_subtype_aspect_counts_exist[entity_type][sub_type][aspect] += (
+                    count
+                )
 
         self.aspects.clear()
         self.aspects_by_subtypes.clear()
-
+        self.aspects_by_subtypes_full_count.clear()
         for entity_type, subtype_counts in entity_subtype_aspect_counts.items():
             for sub_type, aspect_counts in subtype_counts.items():
                 for aspect, count in aspect_counts.items():
                     self.aspects[entity_type][aspect] += count
-
+                self.aspects_by_subtypes_full_count[entity_type][sub_type] = dict(
+                    aspect_counts
+                )
+
+        for entity_type, subtype_counts in entity_subtype_aspect_counts_exist.items():
+            for sub_type, aspect_counts in subtype_counts.items():
                 self.aspects_by_subtypes[entity_type][sub_type] = dict(aspect_counts)
 
         self.samples.clear()
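The report keeps these tallies in three-level nested `defaultdict` counters (entity type, then subtype, then aspect), later frozen into plain dicts. A minimal sketch of that pattern (the keys below are illustrative):

```python
from collections import defaultdict
from typing import Dict

counts: Dict[str, Dict[str, Dict[str, int]]] = defaultdict(
    lambda: defaultdict(lambda: defaultdict(int))
)
counts["dataset"]["Table"]["schemaMetadata"] += 3
counts["dataset"]["Table"]["status"] += 1

# Freeze the nested counters into ordinary dicts for reporting/serialization.
frozen = {e: {s: dict(a) for s, a in subs.items()} for e, subs in counts.items()}
print(frozen)  # {'dataset': {'Table': {'schemaMetadata': 3, 'status': 1}}}
```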
datahub/ingestion/api/source.py
CHANGED
@@ -81,11 +81,24 @@ class StructuredLogLevel(Enum):
     ERROR = logging.ERROR
 
 
+class StructuredLogCategory(Enum):
+    """
+    This is used to categorise the errors mainly based on the biggest impact area
+    This is to be used to help in self-serve understand the impact of any log entry
+    More enums to be added as logs are updated to be self-serve
+    """
+
+    LINEAGE = "LINEAGE"
+    USAGE = "USAGE"
+    PROFILING = "PROFILING"
+
+
 @dataclass
 class StructuredLogEntry(Report):
     title: Optional[str]
     message: str
     context: LossyList[str]
+    log_category: Optional[StructuredLogCategory] = None
 
 
 @dataclass
@@ -108,9 +121,10 @@ class StructuredLogs(Report):
         exc: Optional[BaseException] = None,
         log: bool = False,
         stacklevel: int = 1,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
         """
-        Report a user-facing
+        Report a user-facing log for the ingestion run.
 
         Args:
             level: The level of the log entry.
@@ -118,6 +132,9 @@ class StructuredLogs(Report):
             title: The category / heading to present on for this message in the UI.
             context: Additional context (e.g. where, how) for the log entry.
             exc: The exception associated with the event. We'll show the stack trace when in debug mode.
+            log_category: The type of the log entry. This is used to categorise the log entry.
+            log: Whether to log the entry to the console.
+            stacklevel: The stack level to use for the log entry.
         """
 
         # One for this method, and one for the containing report_* call.
@@ -160,6 +177,7 @@ class StructuredLogs(Report):
                 title=title,
                 message=message,
                 context=context_list,
+                log_category=log_category,
             )
         else:
             if context is not None:
@@ -219,9 +237,19 @@ class SourceReport(ExamplesReport):
         context: Optional[str] = None,
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN,
+            StructuredLogLevel.WARN,
+            message,
+            title,
+            context,
+            exc,
+            log=False,
+            log_category=log_category,
         )
 
     def warning(
@@ -231,9 +259,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.WARN,
+            StructuredLogLevel.WARN,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     def report_failure(
@@ -243,9 +281,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.ERROR,
+            StructuredLogLevel.ERROR,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     def failure(
@@ -255,9 +303,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.ERROR,
+            StructuredLogLevel.ERROR,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     def info(
@@ -267,9 +325,19 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         exc: Optional[BaseException] = None,
         log: bool = True,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> None:
+        """
+        See docs of StructuredLogs.report_log for details of args
+        """
         self._structured_logs.report_log(
-            StructuredLogLevel.INFO,
+            StructuredLogLevel.INFO,
+            message,
+            title,
+            context,
+            exc,
+            log=log,
+            log_category=log_category,
         )
 
     @contextlib.contextmanager
@@ -279,6 +347,7 @@ class SourceReport(ExamplesReport):
         title: Optional[LiteralString] = None,
         context: Optional[str] = None,
         level: StructuredLogLevel = StructuredLogLevel.ERROR,
+        log_category: Optional[StructuredLogCategory] = None,
     ) -> Iterator[None]:
         # Convenience method that helps avoid boilerplate try/except blocks.
         # TODO: I'm not super happy with the naming here - it's not obvious that this
@@ -287,7 +356,12 @@ class SourceReport(ExamplesReport):
             yield
         except Exception as exc:
             self._structured_logs.report_log(
-                level,
+                level,
+                message=message,
+                title=title,
+                context=context,
+                exc=exc,
+                log_category=log_category,
             )
 
     def __post_init__(self) -> None:
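Taken together, these changes let a source tag a structured log with its impact area. A hedged usage sketch (assuming a SourceReport can be constructed standalone, as simple sources do; the message and context are illustrative):

```python
from datahub.ingestion.api.source import SourceReport, StructuredLogCategory

report = SourceReport()
report.warning(
    message="Failed to resolve upstream table for a view",
    context="db.schema.my_view",
    log_category=StructuredLogCategory.LINEAGE,
)
```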
datahub/ingestion/autogenerated/capability_summary.json
CHANGED

@@ -1,9 +1,18 @@
 {
-  "generated_at": "2025-07-
+  "generated_at": "2025-07-31T12:54:30.557618+00:00",
   "generated_by": "metadata-ingestion/scripts/capability_summary.py",
   "plugin_details": {
     "abs": {
       "capabilities": [
+        {
+          "capability": "CONTAINERS",
+          "description": "Extract ABS containers and folders",
+          "subtype_modifier": [
+            "Folder",
+            "ABS container"
+          ],
+          "supported": true
+        },
         {
           "capability": "DATA_PROFILING",
           "description": "Optionally enabled via configuration",
@@ -468,7 +477,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier":
+          "subtype_modifier": [
+            "Database"
+          ],
           "supported": true
         },
         {
@@ -531,13 +542,6 @@
       "platform_name": "File Based Lineage",
       "support_status": "CERTIFIED"
     },
-    "datahub-mock-data": {
-      "capabilities": [],
-      "classname": "datahub.ingestion.source.mock_data.datahub_mock_data.DataHubMockDataSource",
-      "platform_id": "datahubmockdata",
-      "platform_name": "DataHubMockData",
-      "support_status": "TESTING"
-    },
     "dbt": {
       "capabilities": [
         {
@@ -607,7 +611,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier":
+          "subtype_modifier": [
+            "Folder"
+          ],
           "supported": true
         },
         {
@@ -643,6 +649,14 @@
           "subtype_modifier": null,
           "supported": true
         },
+        {
+          "capability": "LINEAGE_FINE",
+          "description": "Extract column-level lineage",
+          "subtype_modifier": [
+            "Table"
+          ],
+          "supported": true
+        },
         {
           "capability": "DATA_PROFILING",
           "description": "Optionally enabled via configuration",
@@ -688,7 +702,9 @@
         {
           "capability": "LINEAGE_COARSE",
           "description": "Enabled by default",
-          "subtype_modifier":
+          "subtype_modifier": [
+            "Table"
+          ],
           "supported": true
         }
       ],
@@ -1229,8 +1245,7 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "
-            "Schema"
+            "Catalog"
           ],
           "supported": true
         },
@@ -2387,8 +2402,9 @@
         },
         {
           "capability": "LINEAGE_COARSE",
-          "description": "
+          "description": "Extract table-level lineage",
           "subtype_modifier": [
+            "Table",
             "View"
           ],
           "supported": true
@@ -2411,8 +2427,7 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "
-            "Schema"
+            "Catalog"
           ],
           "supported": true
         },
@@ -2598,7 +2613,8 @@
           "capability": "CONTAINERS",
           "description": "Enabled by default",
           "subtype_modifier": [
-            "Database"
+            "Database",
+            "Schema"
           ],
           "supported": true
         },
@@ -2812,6 +2828,15 @@
           "description": "Enabled by default",
           "subtype_modifier": null,
           "supported": true
+        },
+        {
+          "capability": "LINEAGE_COARSE",
+          "description": "Extract table-level lineage for Salesforce objects",
+          "subtype_modifier": [
+            "Custom Object",
+            "Object"
+          ],
+          "supported": true
         }
       ],
       "classname": "datahub.ingestion.source.salesforce.SalesforceSource",
@@ -3207,7 +3232,9 @@
         {
           "capability": "CONTAINERS",
           "description": "Enabled by default",
-          "subtype_modifier":
+          "subtype_modifier": [
+            "Database"
+          ],
           "supported": true
         },
         {
@@ -3339,8 +3366,9 @@
         },
         {
           "capability": "LINEAGE_COARSE",
-          "description": "
+          "description": "Extract table-level lineage",
           "subtype_modifier": [
+            "Table",
             "View"
           ],
           "supported": true
datahub/ingestion/graph/client.py
CHANGED

@@ -76,7 +76,15 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    CorpUserUrn,
+    MlFeatureTableUrn,
+    MlFeatureUrn,
+    MlModelGroupUrn,
+    MlModelUrn,
+    MlPrimaryKeyUrn,
+    Urn,
+)
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
@@ -118,8 +126,16 @@ def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""
 
     # Hard-coded special cases.
-
-
+    special_cases = {
+        CorpUserUrn.ENTITY_TYPE: "CORP_USER",
+        MlModelUrn.ENTITY_TYPE: "MLMODEL",
+        MlModelGroupUrn.ENTITY_TYPE: "MLMODEL_GROUP",
+        MlFeatureTableUrn.ENTITY_TYPE: "MLFEATURE_TABLE",
+        MlFeatureUrn.ENTITY_TYPE: "MLFEATURE",
+        MlPrimaryKeyUrn.ENTITY_TYPE: "MLPRIMARY_KEY",
+    }
+    if entity_type in special_cases:
+        return special_cases[entity_type]
 
     # Convert camelCase to UPPER_UNDERSCORE.
     entity_type = (
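The helper above tries an explicit lookup first and only then falls back to a camelCase-to-UPPER_UNDERSCORE conversion. A rough standalone sketch of that approach (the regex and sample mappings are illustrative, not the package's exact code):

```python
import re

SPECIAL = {"corpuser": "CORP_USER", "mlModel": "MLMODEL"}

def to_graphql_enum(entity_type: str) -> str:
    if entity_type in SPECIAL:
        return SPECIAL[entity_type]
    # insert an underscore at each lower->upper boundary, then uppercase
    return re.sub(r"(?<=[a-z])(?=[A-Z])", "_", entity_type).upper()

print(to_graphql_enum("dataFlow"))  # DATA_FLOW
print(to_graphql_enum("corpuser"))  # CORP_USER
```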
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -92,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
 @dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
     mode: Optional[RestSinkMode] = None
+    endpoint: Optional[RestSinkEndpoint] = None
     max_threads: Optional[int] = None
     gms_version: Optional[str] = None
     pending_requests: int = 0
@@ -142,6 +143,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
 
         self.report.gms_version = gms_config.service_version
         self.report.mode = self.config.mode
+        self.report.endpoint = self.config.endpoint
         self.report.max_threads = self.config.max_threads
         logger.debug("Setting env variables to override config")
         logger.debug("Setting gms config")
datahub/ingestion/source/abs/config.py
CHANGED

@@ -151,7 +151,7 @@ class DataLakeSourceConfig(
             raise ValueError("platform must not be empty")
         return platform
 
-    @pydantic.root_validator()
+    @pydantic.root_validator(skip_on_failure=True)
     def ensure_profiling_pattern_is_passed_to_profiling(
         cls, values: Dict[str, Any]
     ) -> Dict[str, Any]:
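Note on `skip_on_failure=True`: with pydantic's v1-style API (including the compatibility shim in pydantic 2), a post-validation `root_validator` must opt in to being skipped when field validation has already failed; otherwise newer pydantic versions reject the bare decorator. A hedged sketch of the idea (the model and fields are illustrative):

```python
from pydantic import BaseModel, root_validator  # v1-style API


class ProfilingSettings(BaseModel):
    profiling_enabled: bool = False
    profile_patterns: list = []

    @root_validator(skip_on_failure=True)
    def check_profiling(cls, values):
        # Runs only when all field validators passed, so `values` is fully populated.
        if values["profiling_enabled"] and not values["profile_patterns"]:
            raise ValueError("profiling requires at least one pattern")
        return values
```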
|