acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/cli/specific/dataproduct_cli.py
CHANGED

@@ -45,7 +45,7 @@ def _get_owner_urn(maybe_urn: str) -> str:
 
 
 def _abort_if_non_existent_urn(graph: DataHubGraph, urn: str, operation: str) -> None:
     try:
-        parsed_urn: Urn = Urn.
+        parsed_urn: Urn = Urn.from_string(urn)
         entity_type = parsed_urn.get_type()
     except Exception:
         click.secho(f"Provided urn {urn} does not seem valid", fg="red")
datahub/cli/specific/structuredproperties_cli.py
CHANGED

@@ -31,7 +31,8 @@ def properties() -> None:
 def upsert(file: Path) -> None:
     """Upsert structured properties in DataHub."""
 
-
+    with get_default_graph() as graph:
+        StructuredProperties.create(str(file), graph)
 
 
 @properties.command(
datahub/configuration/common.py
CHANGED
@@ -258,7 +258,7 @@ class AllowDenyPattern(ConfigModel):
         return AllowDenyPattern()
 
     def allowed(self, string: str) -> bool:
-        if self.
+        if self.denied(string):
             return False
 
         return any(
@@ -266,7 +266,7 @@ class AllowDenyPattern(ConfigModel):
             for allow_pattern in self.allow
         )
 
-    def
+    def denied(self, string: str) -> bool:
         for deny_pattern in self.deny:
             if re.match(deny_pattern, string, self.regex_flags):
                 return True
@@ -290,7 +290,7 @@ class AllowDenyPattern(ConfigModel):
            raise ValueError(
                "allow list must be fully specified to get list of allowed strings"
            )
-        return [a for a in self.allow if not self.
+        return [a for a in self.allow if not self.denied(a)]
 
     def __eq__(self, other):  # type: ignore
         return isinstance(other, self.__class__) and self.__dict__ == other.__dict__
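For context, AllowDenyPattern is the allow/deny filter used across ingestion configs, and the change above makes the deny check a public denied() method that allowed() and the allow-list helper now call. A minimal usage sketch; the pattern strings are made up for illustration:

    from datahub.configuration.common import AllowDenyPattern

    # Hypothetical filter: keep everything in the analytics schema except tmp_ tables.
    pattern = AllowDenyPattern(allow=["analytics\\..*"], deny=[".*\\.tmp_.*"])

    assert pattern.allowed("analytics.orders")           # matches an allow pattern, not denied
    assert pattern.denied("analytics.tmp_scratch")       # deny patterns are checked first
    assert not pattern.allowed("analytics.tmp_scratch")  # denied entries are never allowed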
datahub/configuration/git.py
CHANGED
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ class GitReference(ConfigModel):
 
     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
+        if self.url_subdir:
+            file_path = f"{self.url_subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
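A short sketch of what the new url_subdir option does to URL generation. The repo and file paths below are illustrative, and the example assumes the GitHub url_template inference mentioned in the field description:

    from datahub.configuration.git import GitReference

    # Hypothetical repo where the tracked files live under a "models/" subdirectory.
    ref = GitReference(
        repo="https://github.com/example-org/analytics",
        branch="main",
        url_subdir="models",
    )

    # URL generation prepends the subdirectory; clone/checkout behavior is unchanged.
    print(ref.get_url_for_file_path("staging/orders.sql"))
    # e.g. https://github.com/example-org/analytics/blob/main/models/staging/orders.sql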
datahub/configuration/kafka_consumer_config.py
CHANGED

@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional
 
@@ -34,5 +35,34 @@ class CallableConsumerConfig:
                 "oauth_cb must be a string representing python function reference "
                 "in the format <python-module>:<function-name>."
             )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] =
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
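For reference, the new signature check expects the configured oauth_cb callable to take exactly one required positional argument, which matches the confluent-kafka OAuth callback convention. A sketch of a function that would pass validation; the module path and token logic are hypothetical:

    # my_company/kafka_auth.py, referenced in the consumer config as
    # oauth_cb: "my_company.kafka_auth:get_oauth_token"
    import time


    def get_oauth_token(oauth_config):
        # confluent-kafka passes a single positional argument (the value of
        # sasl.oauthbearer.config) and expects (token, expiry_epoch_seconds) back.
        token = "placeholder-token"  # fetch a real token from your identity provider
        return token, time.time() + 3600.0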
datahub/emitter/mcp_patch_builder.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import time
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -6,12 +7,15 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChangeTypeClass,
+    EdgeClass,
     GenericAspectClass,
     KafkaAuditHeaderClass,
     MetadataChangeProposalClass,
     SystemMetadataClass,
 )
+from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
@@ -89,3 +93,42 @@ class MetadataPatchProposal:
             )
             for aspect_name, patches in self.patches.items()
         ]
+
+    @classmethod
+    def _mint_auditstamp(cls, message: Optional[str] = None) -> AuditStampClass:
+        """
+        Creates an AuditStampClass instance with the current timestamp and other default values.
+
+        Args:
+            message: The message associated with the audit stamp (optional).
+
+        Returns:
+            An instance of AuditStampClass.
+        """
+        return AuditStampClass(
+            time=int(time.time() * 1000.0),
+            actor="urn:li:corpuser:datahub",
+            message=message,
+        )
+
+    @classmethod
+    def _ensure_urn_type(
+        cls, entity_type: str, edges: List[EdgeClass], context: str
+    ) -> None:
+        """
+        Ensures that the destination URNs in the given edges have the specified entity type.
+
+        Args:
+            entity_type: The entity type to check against.
+            edges: A list of Edge objects.
+            context: The context or description of the operation.
+
+        Raises:
+            ValueError: If any of the destination URNs is not of the specified entity type.
+        """
+        for e in edges:
+            urn = Urn.from_string(e.destinationUrn)
+            if not urn.entity_type == entity_type:
+                raise ValueError(
+                    f"{context}: {e.destinationUrn} is not of type {entity_type}"
+                )
datahub/emitter/rest_emitter.py
CHANGED
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
-# The limit is 16mb. We will use a max of 15mb to have some space
-
+# The limit is 16mb. We will use a max of 15mb to have some space
+# for overhead like request headers.
+# This applies to pretty much all calls to GMS.
+INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
+
+# This limit is somewhat arbitrary. All GMS endpoints will timeout
+# and return a 500 if processing takes too long. To avoid sending
+# too much to the backend and hitting a timeout, we try to limit
+# the number of MCPs we send in a batch.
+BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
+    os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
+)
 
 
 class DataHubRestEmitter(Closeable, Emitter):
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
         # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
         # If we will exceed the limit, we need to break it up into chunks.
         mcp_obj_chunks: List[List[str]] = []
-        current_chunk_size =
+        current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
 
-            if
+            if (
+                mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
+                or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
+            ):
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
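The net effect of the two new constants is that batched REST ingestion is now chunked by both payload size and MCP count, with the count cap overridable via an environment variable. A small sketch; it assumes the variable is set before the module is first imported, since the diff reads it at import time:

    import os

    # Optional override; 200 is the default per the diff above.
    os.environ["DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH"] = "100"

    from datahub.emitter.rest_emitter import (
        BATCH_INGEST_MAX_PAYLOAD_LENGTH,
        INGEST_MAX_PAYLOAD_BYTES,
    )

    print(INGEST_MAX_PAYLOAD_BYTES)         # 15728640 (15 MiB)
    print(BATCH_INGEST_MAX_PAYLOAD_LENGTH)  # 100 with the override above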
datahub/ingestion/api/incremental_properties_helper.py
ADDED

@@ -0,0 +1,69 @@
+import logging
+from typing import Iterable, Optional
+
+from pydantic.fields import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import set_aspect
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetPropertiesClass,
+    MetadataChangeEventClass,
+    SystemMetadataClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def convert_dataset_properties_to_patch(
+    urn: str,
+    aspect: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass],
+) -> MetadataWorkUnit:
+    patch_builder = create_dataset_props_patch_builder(urn, aspect, system_metadata)
+    mcp = next(iter(patch_builder.build()))
+    return MetadataWorkUnit(id=MetadataWorkUnit.generate_workunit_id(mcp), mcp_raw=mcp)
+
+
+def auto_incremental_properties(
+    incremental_properties: bool,
+    stream: Iterable[MetadataWorkUnit],
+) -> Iterable[MetadataWorkUnit]:
+    if not incremental_properties:
+        yield from stream
+        return  # early exit
+
+    for wu in stream:
+        urn = wu.get_urn()
+
+        if isinstance(wu.metadata, MetadataChangeEventClass):
+            properties_aspect = wu.get_aspect_of_type(DatasetPropertiesClass)
+            set_aspect(wu.metadata, None, DatasetPropertiesClass)
+            if len(wu.metadata.proposedSnapshot.aspects) > 0:
+                yield wu
+
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        elif isinstance(wu.metadata, MetadataChangeProposalWrapper) and isinstance(
+            wu.metadata.aspect, DatasetPropertiesClass
+        ):
+            properties_aspect = wu.metadata.aspect
+            if properties_aspect:
+                yield convert_dataset_properties_to_patch(
+                    urn, properties_aspect, wu.metadata.systemMetadata
+                )
+        else:
+            yield wu
+
+
+# TODO: Use this in SQLCommonConfig. Currently only used in snowflake
+class IncrementalPropertiesConfigMixin(ConfigModel):
+    incremental_properties: bool = Field(
+        default=False,
+        description="When enabled, emits dataset properties as incremental to existing dataset properties "
+        "in DataHub. When disabled, re-states dataset properties on each run.",
+    )
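To show how the helper is intended to be wired in, here is a sketch only; per the TODO above, the mixin is currently used only by the Snowflake source, and the processor-list pattern below follows the existing workunit-processor convention rather than any specific source's code:

    from functools import partial

    from datahub.ingestion.api.incremental_properties_helper import (
        auto_incremental_properties,
    )

    # Inside a hypothetical source whose config uses IncrementalPropertiesConfigMixin:
    # wrap the workunit stream so DatasetProperties aspects become patch proposals
    # whenever incremental_properties is enabled.
    def get_workunit_processors(self):
        return [
            partial(auto_incremental_properties, self.config.incremental_properties),
        ]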
datahub/ingestion/api/source.py
CHANGED
@@ -184,6 +184,7 @@ class StructuredLogs(Report):
 
 @dataclass
 class SourceReport(Report):
+    event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0
 
@@ -492,11 +493,15 @@ class Source(Closeable, metaclass=ABCMeta):
 
     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
-
+        platform = (
             getattr(config, "platform_name", None)
             or getattr(self, "platform", None)
             or getattr(config, "platform", None)
         )
+        if platform is None and hasattr(self, "get_platform_id"):
+            platform = type(self).get_platform_id()
+
+        return platform
 
     def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
         config = self.get_config()
datahub/ingestion/api/source_helpers.py
CHANGED

@@ -32,6 +32,7 @@ from datahub.metadata.schema_classes import (
     SchemaFieldClass,
     SchemaMetadataClass,
     StatusClass,
+    SystemMetadataClass,
     TimeWindowSizeClass,
 )
 from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
@@ -65,9 +66,10 @@ def auto_workunit(
 def create_dataset_props_patch_builder(
     dataset_urn: str,
     dataset_properties: DatasetPropertiesClass,
+    system_metadata: Optional[SystemMetadataClass] = None,
 ) -> DatasetPatchBuilder:
     """Creates a patch builder with a table's or view's attributes and dataset properties"""
-    patch_builder = DatasetPatchBuilder(dataset_urn)
+    patch_builder = DatasetPatchBuilder(dataset_urn, system_metadata)
     patch_builder.set_display_name(dataset_properties.name)
     patch_builder.set_description(dataset_properties.description)
     patch_builder.set_created(dataset_properties.created)
@@ -148,7 +150,7 @@ def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Itera
         report.report_workunit(wu)
         yield wu
 
-    if report.events_produced == 0:
+    if report.event_not_produced_warn and report.events_produced == 0:
         report.warning(
             title="No metadata was produced by the source",
             message="Please check the source configuration, filters, and permissions.",
datahub/ingestion/graph/client.py
CHANGED

@@ -67,6 +67,7 @@ from datahub.metadata.schema_classes import (
     SystemMetadataClass,
     TelemetryClientIdClass,
 )
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.str_enum import StrEnum
 from datahub.utilities.urns.urn import Urn, guess_entity_type
@@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph:
     graph_config = config_utils.load_client_config()
     graph = DataHubGraph(graph_config)
     graph.test_connection()
+    telemetry_instance.set_context(server=graph)
     return graph
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py
CHANGED

@@ -148,10 +148,10 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
 
     def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
         assert ctx.pipeline_config
-        if not self.report_recipe or not ctx.pipeline_config.
+        if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
             return ""
         else:
-            return json.dumps(redact_raw_config(ctx.pipeline_config.
+            return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
 
     def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
         self.sink.write_record_async(
datahub/ingestion/run/pipeline.py
CHANGED

@@ -44,7 +44,8 @@ from datahub.ingestion.transformer.system_metadata_transformer import (
 )
 from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.metadata.schema_classes import MetadataChangeProposalClass
-from datahub.telemetry import stats
+from datahub.telemetry import stats
+from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -220,7 +221,7 @@ class Pipeline:
         dry_run: bool = False,
         preview_mode: bool = False,
         preview_workunits: int = 10,
-        report_to: Optional[str] =
+        report_to: Optional[str] = "datahub",
         no_progress: bool = False,
     ):
         self.config = config
@@ -273,8 +274,9 @@ class Pipeline:
         if self.graph is None and isinstance(self.sink, DatahubRestSink):
             with _add_init_error_context("setup default datahub client"):
                 self.graph = self.sink.emitter.to_graph()
+                self.graph.test_connection()
         self.ctx.graph = self.graph
-
+        telemetry_instance.set_context(server=self.graph)
 
         with set_graph_context(self.graph):
             with _add_init_error_context("configure reporters"):
@@ -615,7 +617,7 @@ class Pipeline:
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
 
-
+        telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
@@ -637,7 +639,6 @@ class Pipeline:
                 ),
                 "has_pipeline_name": bool(self.config.pipeline_name),
             },
-            self.ctx.graph,
         )
 
     def _approx_all_vals(self, d: LossyList[Any]) -> int:
datahub/ingestion/run/pipeline_config.py
CHANGED

@@ -117,3 +117,9 @@ class PipelineConfig(ConfigModel):
         config = cls.parse_obj(resolved_dict)
         config._raw_dict = raw_dict
         return config
+
+    def get_raw_dict(self) -> Dict:
+        result = self._raw_dict
+        if result is None:
+            result = self.dict()
+        return result
datahub/ingestion/sink/datahub_rest.py
CHANGED

@@ -18,7 +18,10 @@ from datahub.configuration.common import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
-from datahub.emitter.rest_emitter import
+from datahub.emitter.rest_emitter import (
+    BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+    DataHubRestEmitter,
+)
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
     NoopWriteCallback,
@@ -65,11 +68,19 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
 
     # These only apply in async modes.
-    max_threads:
-    max_pending_requests:
+    max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
+    max_pending_requests: pydantic.PositiveInt = 2000
 
     # Only applies in async batch mode.
-    max_per_batch:
+    max_per_batch: pydantic.PositiveInt = 100
+
+    @pydantic.validator("max_per_batch", always=True)
+    def validate_max_per_batch(cls, v):
+        if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
+            raise ValueError(
+                f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
+            )
+        return v
 
 
 @dataclasses.dataclass
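A quick illustration of the new max_per_batch bound; the server URL and values are placeholders, and the cap is BATCH_INGEST_MAX_PAYLOAD_LENGTH (200 by default per the rest_emitter diff above):

    from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

    # Accepted: within the batch-length cap.
    ok = DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=150)

    # Rejected: pydantic validation fails because 500 exceeds the cap.
    try:
        DatahubRestSinkConfig(server="http://localhost:8080", max_per_batch=500)
    except ValueError as e:
        print(e)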
datahub/ingestion/source/abs/source.py
CHANGED

@@ -201,6 +201,10 @@ class ABSSource(StatefulIngestionSourceBase):
             ).infer_schema(file)
         elif extension == ".json":
             fields = json.JsonInferrer().infer_schema(file)
+        elif extension == ".jsonl":
+            fields = json.JsonInferrer(
+                max_rows=self.source_config.max_rows, format="jsonl"
+            ).infer_schema(file)
         elif extension == ".avro":
             fields = avro.AvroInferrer().infer_schema(file)
         else:
datahub/ingestion/source/aws/aws_common.py
CHANGED

@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
 
 import boto3
 from boto3.session import Session
@@ -107,6 +107,14 @@ class AwsConnectionConfig(ConfigModel):
         default=None,
         description="A set of proxy configs to use with AWS. See the [botocore.config](https://botocore.amazonaws.com/v1/documentation/api/latest/reference/config.html) docs for details.",
     )
+    aws_retry_num: int = Field(
+        default=5,
+        description="Number of times to retry failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
+    aws_retry_mode: Literal["legacy", "standard", "adaptive"] = Field(
+        default="standard",
+        description="Retry mode to use for failed AWS requests. See the [botocore.retry](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) docs for details.",
+    )
 
     read_timeout: float = Field(
         default=DEFAULT_TIMEOUT,
@@ -199,6 +207,10 @@ class AwsConnectionConfig(ConfigModel):
         return Config(
             proxies=self.aws_proxy,
             read_timeout=self.read_timeout,
+            retries={
+                "max_attempts": self.aws_retry_num,
+                "mode": self.aws_retry_mode,
+            },
             **self.aws_advanced_config,
         )
 
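For reference, the retries mapping assembled above is standard botocore retry configuration; an equivalent standalone client setup looks roughly like this (the service and call are illustrative):

    import boto3
    from botocore.config import Config

    # Same behavior as aws_retry_num=5, aws_retry_mode="standard" in the config above.
    retry_config = Config(retries={"max_attempts": 5, "mode": "standard"})

    s3 = boto3.client("s3", config=retry_config)
    s3.list_buckets()  # retried up to the configured attempts on retryable errors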
datahub/ingestion/source/aws/sagemaker.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional
 
@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 
 @platform_name("SageMaker")
 @config_class(SagemakerSourceConfig)
@@ -75,6 +78,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
     ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Starting SageMaker ingestion...")
         # get common lineage graph
         lineage_processor = LineageProcessor(
             sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
@@ -83,6 +87,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract feature groups if specified
         if self.source_config.extract_feature_groups:
+            logger.info("Extracting feature groups...")
             feature_group_processor = FeatureGroupProcessor(
                 sagemaker_client=self.sagemaker_client, env=self.env, report=self.report
             )
@@ -95,6 +100,7 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract jobs if specified
         if self.source_config.extract_jobs is not False:
+            logger.info("Extracting jobs...")
             job_processor = JobProcessor(
                 sagemaker_client=self.client_factory.get_client,
                 env=self.env,
@@ -109,6 +115,8 @@ class SagemakerSource(StatefulIngestionSourceBase):
 
         # extract models if specified
         if self.source_config.extract_models:
+            logger.info("Extracting models...")
+
             model_processor = ModelProcessor(
                 sagemaker_client=self.sagemaker_client,
                 env=self.env,
datahub/ingestion/source/aws/sagemaker_processors/common.py
CHANGED

@@ -40,8 +40,11 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     groups_scanned = 0
     models_scanned = 0
     jobs_scanned = 0
+    jobs_processed = 0
     datasets_scanned = 0
     filtered: List[str] = field(default_factory=list)
+    model_endpoint_lineage = 0
+    model_group_lineage = 0
 
     def report_feature_group_scanned(self) -> None:
         self.feature_groups_scanned += 1
@@ -58,6 +61,9 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     def report_model_scanned(self) -> None:
         self.models_scanned += 1
 
+    def report_job_processed(self) -> None:
+        self.jobs_processed += 1
+
     def report_job_scanned(self) -> None:
         self.jobs_scanned += 1
 
datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py
CHANGED

@@ -1,3 +1,5 @@
+import logging
+import textwrap
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Iterable, List
 
@@ -28,6 +30,8 @@ if TYPE_CHECKING:
         FeatureGroupSummaryTypeDef,
     )
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class FeatureGroupProcessor:
@@ -197,11 +201,12 @@ class FeatureGroupProcessor:
 
         full_table_name = f"{glue_database}.{glue_table}"
 
-
-
-
+        logging.info(
+            textwrap.dedent(
+                f"""Note: table {full_table_name} is an AWS Glue object. This source does not ingest all metadata for Glue tables.
             To view full table metadata, run Glue ingestion
-            (see https://datahubproject.io/docs/
+            (see https://datahubproject.io/docs/generated/ingestion/sources/glue)"""
+            )
         )
 
         feature_sources.append(
datahub/ingestion/source/aws/sagemaker_processors/jobs.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
@@ -49,6 +50,8 @@ from datahub.metadata.schema_classes import (
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
 
+logger = logging.getLogger(__name__)
+
 JobInfo = TypeVar(
     "JobInfo",
     AutoMlJobInfo,
@@ -274,15 +277,18 @@ class JobProcessor:
     )
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        logger.info("Getting all SageMaker jobs")
         jobs = self.get_all_jobs()
 
         processed_jobs: Dict[str, SageMakerJob] = {}
 
+        logger.info("Processing SageMaker jobs")
         # first pass: process jobs and collect datasets used
+        logger.info("first pass: process jobs and collect datasets used")
        for job in jobs:
            job_type = job_type_to_info[job["type"]]
            job_name = job[job_type.list_name_key]
-
+            logger.debug(f"Processing job {job_name} with type {job_type}")
            job_details = self.get_job_details(job_name, job["type"])

            processed_job = getattr(self, job_type.processor)(job_details)
@@ -293,6 +299,9 @@ class JobProcessor:
         # second pass:
         # - move output jobs to inputs
         # - aggregate i/o datasets
+        logger.info(
+            "second pass: move output jobs to inputs and aggregate i/o datasets"
+        )
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
 
@@ -301,6 +310,7 @@ class JobProcessor:
 
             all_datasets.update(processed_job.input_datasets)
             all_datasets.update(processed_job.output_datasets)
+            self.report.report_job_processed()
 
         # yield datasets
         for dataset_urn, dataset in all_datasets.items():
@@ -322,6 +332,7 @@ class JobProcessor:
             self.report.report_dataset_scanned()
 
         # third pass: construct and yield MCEs
+        logger.info("third pass: construct and yield MCEs")
         for job_urn in sorted(processed_jobs):
             processed_job = processed_jobs[job_urn]
             job_snapshot = processed_job.job_snapshot