acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/api/circuit_breaker/assertion_circuit_breaker.py
CHANGED

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, List, Optional
 
 from pydantic import Field
@@ -10,6 +10,7 @@ from datahub.api.circuit_breaker.circuit_breaker import (
     CircuitBreakerConfig,
 )
 from datahub.api.graphql import Assertion, Operation
+from datahub.emitter.mce_builder import parse_ts_millis
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -49,7 +50,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
         if not operations:
             return None
         else:
-            return
+            return parse_ts_millis(operations[0]["lastUpdatedTimestamp"])
 
     def _check_if_assertion_failed(
         self, assertions: List[Dict[str, Any]], last_updated: Optional[datetime] = None
@@ -93,7 +94,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
                 logger.info(f"Found successful assertion: {assertion_urn}")
                 result = False
             if last_updated is not None:
-                last_run =
+                last_run = parse_ts_millis(last_assertion.time)
                 if last_updated > last_run:
                     logger.error(
                         f"Missing assertion run for {assertion_urn}. The dataset was updated on {last_updated} but the last assertion run was at {last_run}"
@@ -117,7 +118,7 @@ class AssertionCircuitBreaker(AbstractCircuitBreaker):
         )
 
         if not last_updated:
-            last_updated = datetime.now() - self.config.time_delta
+            last_updated = datetime.now(tz=timezone.utc) - self.config.time_delta
            logger.info(
                f"Dataset {urn} doesn't have last updated or check_last_assertion_time is false, using calculated min assertion date {last_updated}"
            )
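The switch to timezone-aware timestamps matters because Python refuses to compare naive and aware datetimes, which is exactly the comparison this circuit breaker performs. A minimal illustration (standard library only, not code from the package):

```python
from datetime import datetime, timezone

# Timestamps parsed from epoch millis are timezone-aware (UTC) after this change.
last_run = datetime.fromtimestamp(1700000000000 / 1000, tz=timezone.utc)

# Comparing against a naive datetime.now() raises:
# TypeError: can't compare offset-naive and offset-aware datetimes
try:
    datetime.now() > last_run
except TypeError as e:
    print(e)

# datetime.now(tz=timezone.utc), as used in the diff, keeps both sides aware.
print(datetime.now(tz=timezone.utc) > last_run)
```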
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED

@@ -14,7 +14,7 @@ from datahub.metadata.schema_classes import (
     PropertyValueClass,
     StructuredPropertyDefinitionClass,
 )
-from datahub.metadata.urns import StructuredPropertyUrn, Urn
+from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
 from datahub.utilities.urns._urn_base import URN_TYPES
 
 logging.basicConfig(level=logging.INFO)
@@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel):
 
     @validator("type")
     def validate_type(cls, v: str) -> str:
-        #
-
+        # This logic is somewhat hacky, since we need to deal with
+        # 1. fully qualified urns
+        # 2. raw data types, that need to get the datahub namespace prefix
+        # While keeping the user-facing interface and error messages clean.
+
+        if not v.startswith("urn:li:") and not v.islower():
+            # Convert to lowercase if needed
+            v = v.lower()
             logger.warning(
-                f"Structured property type should be lowercase. Updated to {v
+                f"Structured property type should be lowercase. Updated to {v}"
             )
-
+
+        urn = Urn.make_data_type_urn(v)
 
         # Check if type is allowed
-
+        data_type_urn = DataTypeUrn.from_string(urn)
+        unqualified_data_type = data_type_urn.id
+        if unqualified_data_type.startswith("datahub."):
+            unqualified_data_type = unqualified_data_type[len("datahub.") :]
+        if not AllowedTypes.check_allowed_type(unqualified_data_type):
             raise ValueError(
-                f"Type {
+                f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
             )
-
+
+        return urn
 
     @property
     def fqn(self) -> str:
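For illustration, the validator's normalization can be sketched outside the class. This is a hedged, standalone approximation: the allow-list below is a stand-in for `AllowedTypes`, the prefix handling mirrors the added `unqualified_data_type` lines, and the real validator returns the urn built by `Urn.make_data_type_urn` rather than the raw string:

```python
ALLOWED = {"string", "rich_text", "number", "date", "urn"}  # stand-in for AllowedTypes.values()


def normalize_type(v: str) -> str:
    # Lowercase raw types such as "STRING"; fully qualified urns are left alone.
    if not v.startswith("urn:li:") and not v.islower():
        v = v.lower()
    # Strip the "datahub." namespace before checking the allow-list,
    # mirroring the unqualified_data_type handling in the diff.
    unqualified = v.split(":")[-1] if v.startswith("urn:li:dataType:") else v
    if unqualified.startswith("datahub."):
        unqualified = unqualified[len("datahub."):]
    if unqualified not in ALLOWED:
        raise ValueError(f"Type {unqualified} is not allowed. Allowed types are {sorted(ALLOWED)}")
    return v


print(normalize_type("STRING"))                        # -> "string"
print(normalize_type("urn:li:dataType:datahub.date"))  # accepted, urn left as-is
```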
datahub/configuration/common.py
CHANGED
@@ -10,7 +10,6 @@ from typing import (
     List,
     Optional,
     Type,
-    TypeVar,
     Union,
     runtime_checkable,
 )
@@ -19,14 +18,12 @@ import pydantic
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
-from typing_extensions import Protocol
+from typing_extensions import Protocol, Self
 
 from datahub.configuration._config_enum import ConfigEnum as ConfigEnum  # noqa: I250
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.utilities.dedup_list import deduplicate_list
 
-_ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel")
-
 REDACT_KEYS = {
     "password",
     "token",
@@ -109,7 +106,7 @@ class ConfigModel(BaseModel):
         schema_extra = _schema_extra
 
     @classmethod
-    def parse_obj_allow_extras(cls
+    def parse_obj_allow_extras(cls, obj: Any) -> Self:
         if PYDANTIC_VERSION_2:
             try:
                 with unittest.mock.patch.dict(
datahub/configuration/source_common.py
CHANGED

@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
         default=None,
         description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
     )
+
+
+class PlatformDetail(ConfigModel):
+    platform_instance: Optional[str] = Field(
+        default=None,
+        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
+        "with platform instance name used in ingestion "
+        "recipe of other datahub sources.",
+    )
+    env: str = Field(
+        default=DEFAULT_ENV,
+        description="The environment that all assets produced by DataHub platform ingestion source belong to",
+    )
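A minimal sketch of how the new `PlatformDetail` model might be used from a recipe-side config; the platform names, instance name, and mapping variable below are placeholders, not values from the package:

```python
from datahub.configuration.source_common import PlatformDetail

# Hypothetical mapping of upstream platform -> instance/env details, e.g. for
# resolving upstream dataset urns when ingesting from another DataHub source.
platform_details = {
    "snowflake": PlatformDetail(platform_instance="prod_warehouse", env="PROD"),
    "kafka": PlatformDetail(),  # falls back to DEFAULT_ENV and no platform instance
}

for platform, detail in platform_details.items():
    print(platform, detail.platform_instance, detail.env)
```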
datahub/emitter/mce_builder.py
CHANGED
@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import Enum
 from typing import (
     TYPE_CHECKING,
@@ -24,6 +24,7 @@ from typing import (
 
 import typing_inspect
 from avrogen.dict_wrapper import DictWrapper
+from typing_extensions import assert_never
 
 from datahub.emitter.enum_helpers import get_enum_options
 from datahub.metadata.schema_classes import (
@@ -103,6 +104,22 @@ def make_ts_millis(ts: Optional[datetime]) -> Optional[int]:
     return int(ts.timestamp() * 1000)
 
 
+@overload
+def parse_ts_millis(ts: float) -> datetime:
+    ...
+
+
+@overload
+def parse_ts_millis(ts: None) -> None:
+    ...
+
+
+def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
+    if ts is None:
+        return None
+    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)
+
+
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
@@ -253,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
         return make_user_urn(owner)
     elif owner_type == OwnerType.GROUP:
         return make_group_urn(owner)
-
-
-    return f"urn:li:{owner_type.value}:{owner}"
+    else:
+        assert_never(owner_type)
 
 
 def make_ownership_type_urn(type: str) -> str:
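The new `parse_ts_millis` helper is the inverse of the existing `make_ts_millis`. A quick round-trip, assuming both are importable from `datahub.emitter.mce_builder` as the diff shows:

```python
from datetime import datetime, timezone

from datahub.emitter.mce_builder import make_ts_millis, parse_ts_millis

now = datetime.now(tz=timezone.utc)
ts_millis = make_ts_millis(now)        # epoch milliseconds as an int
restored = parse_ts_millis(ts_millis)  # timezone-aware UTC datetime

# Round-trips to roughly millisecond precision; None passes through unchanged.
assert restored is not None
assert abs((restored - now).total_seconds()) < 0.002
assert parse_ts_millis(None) is None
```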
datahub/emitter/mcp_builder.py
CHANGED
@@ -4,8 +4,8 @@ from pydantic.fields import Field
 from pydantic.main import BaseModel
 
 from datahub.cli.env_utils import get_boolean_env_variable
-from datahub.emitter.enum_helpers import get_enum_options
 from datahub.emitter.mce_builder import (
+    ALL_ENV_TYPES,
     Aspect,
     datahub_guid,
     make_container_urn,
@@ -25,7 +25,6 @@ from datahub.metadata.schema_classes import (
     ContainerClass,
     DomainsClass,
     EmbedClass,
-    FabricTypeClass,
     GlobalTagsClass,
     MetadataChangeEventClass,
     OwnerClass,
@@ -206,11 +205,7 @@ def gen_containers(
     # Extra validation on the env field.
     # In certain cases (mainly for backwards compatibility), the env field will actually
     # have a platform instance name.
-    env = (
-        container_key.env
-        if container_key.env in get_enum_options(FabricTypeClass)
-        else None
-    )
+    env = container_key.env if container_key.env in ALL_ENV_TYPES else None
 
     container_urn = container_key.as_urn()
 
datahub/emitter/mcp_patch_builder.py
CHANGED

@@ -2,7 +2,19 @@ import json
 import time
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    runtime_checkable,
+)
+
+from typing_extensions import LiteralString
 
 from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
 from datahub.emitter.serialization_helper import pre_json_transform
@@ -19,25 +31,36 @@ from datahub.metadata.urns import Urn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
+@runtime_checkable
+class SupportsToObj(Protocol):
+    def to_obj(self) -> Any:
+        ...
+
+
 def _recursive_to_obj(obj: Any) -> Any:
     if isinstance(obj, list):
         return [_recursive_to_obj(v) for v in obj]
-    elif
+    elif isinstance(obj, SupportsToObj):
         return obj.to_obj()
     else:
         return obj
 
 
+PatchPath = Tuple[Union[LiteralString, Urn], ...]
+PatchOp = Literal["add", "remove", "replace"]
+
+
 @dataclass
-class _Patch:
-    op:
-    path:
+class _Patch(SupportsToObj):
+    op: PatchOp
+    path: PatchPath
     value: Any
 
     def to_obj(self) -> Dict:
+        quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path)
         return {
             "op": self.op,
-            "path":
+            "path": quoted_path,
             "value": _recursive_to_obj(self.value),
         }
 
@@ -63,19 +86,20 @@ class MetadataPatchProposal:
 
     # Json Patch quoting based on https://jsonpatch.com/#json-pointer
     @classmethod
-    def quote(cls, value: str) -> str:
-        return value.replace("~", "~0").replace("/", "~1")
+    def quote(cls, value: Union[str, Urn]) -> str:
+        return str(value).replace("~", "~0").replace("/", "~1")
 
     def _add_patch(
-        self,
+        self,
+        aspect_name: str,
+        op: PatchOp,
+        path: PatchPath,
+        value: Any,
     ) -> None:
-        if not isinstance(path, str):
-            path = "/" + "/".join(self.quote(p) for p in path)
-
         # TODO: Validate that aspectName is a valid aspect for this entityType
         self.patches[aspect_name].append(_Patch(op, path, value))
 
-    def build(self) ->
+    def build(self) -> List[MetadataChangeProposalClass]:
         return [
             MetadataChangeProposalClass(
                 entityUrn=self.urn,
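The quoting rule follows JSON Pointer (RFC 6901), as referenced in the diff: `~` becomes `~0` and `/` becomes `~1` before path segments are joined with `/`. A small sketch of the same escaping, independent of the class:

```python
def quote(segment: str) -> str:
    # Order matters: escape "~" first so an original "/" never ends up as "~01".
    return segment.replace("~", "~0").replace("/", "~1")


def to_pointer(*segments: str) -> str:
    return "/" + "/".join(quote(s) for s in segments)


# A field path containing "/" stays a single pointer segment after escaping.
print(to_pointer("customProperties", "a/b"))  # /customProperties/a~1b
print(to_pointer("fields", "x~y"))            # /fields/x~0y
```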
datahub/emitter/rest_emitter.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import logging
 import os
 from json.decoder import JSONDecodeError
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union
 
 import requests
 from deprecated import deprecated
@@ -13,6 +13,7 @@ from requests.exceptions import HTTPError, RequestException
 from datahub import nice_version_name
 from datahub.cli import config_utils
 from datahub.cli.cli_utils import ensure_has_system_metadata, fixup_gms_url
+from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ConfigurationError, OperationalError
 from datahub.emitter.generic_emitter import Emitter
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -46,6 +47,8 @@ _DEFAULT_RETRY_MAX_TIMES = int(
     os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
 )
 
+_DATAHUB_EMITTER_TRACE = get_boolean_env_variable("DATAHUB_EMITTER_TRACE", False)
+
 # The limit is 16mb. We will use a max of 15mb to have some space
 # for overhead like request headers.
 # This applies to pretty much all calls to GMS.
@@ -288,9 +291,11 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def emit_mcps(
         self,
-        mcps:
+        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
         async_flag: Optional[bool] = None,
     ) -> int:
+        if _DATAHUB_EMITTER_TRACE:
+            logger.debug(f"Attempting to emit MCP batch of size {len(mcps)}")
         url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
         for mcp in mcps:
             ensure_has_system_metadata(mcp)
@@ -303,15 +308,25 @@ class DataHubRestEmitter(Closeable, Emitter):
         current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
         for mcp_obj in mcp_objs:
             mcp_obj_size = len(json.dumps(mcp_obj))
+            if _DATAHUB_EMITTER_TRACE:
+                logger.debug(
+                    f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
+                )
 
             if (
                 mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
                 or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
            ):
+                if _DATAHUB_EMITTER_TRACE:
+                    logger.debug("Decided to create new chunk")
                 mcp_obj_chunks.append([])
                 current_chunk_size = 0
             mcp_obj_chunks[-1].append(mcp_obj)
             current_chunk_size += mcp_obj_size
+        if len(mcp_obj_chunks) > 0:
+            logger.debug(
+                f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
+            )
 
         for mcp_obj_chunk in mcp_obj_chunks:
             # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +353,15 @@ class DataHubRestEmitter(Closeable, Emitter):
 
     def _emit_generic(self, url: str, payload: str) -> None:
         curl_command = make_curl_command(self._session, "POST", url, payload)
+        payload_size = len(payload)
+        if payload_size > INGEST_MAX_PAYLOAD_BYTES:
+            # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
+            logger.warning(
+                f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
+            )
         logger.debug(
-            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
+            "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
+            payload_size,
             curl_command,
         )
         try:
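The batching logic added to `emit_mcps` caps each chunk both by cumulative serialized size and by item count. A standalone sketch of that chunking strategy follows; the constants and helper name are illustrative stand-ins, not the emitter's real values:

```python
import json
from typing import Any, Dict, List

MAX_BYTES = 15 * 1024 * 1024  # stand-in for INGEST_MAX_PAYLOAD_BYTES
MAX_ITEMS = 200               # stand-in for BATCH_INGEST_MAX_PAYLOAD_LENGTH


def chunk_objects(objs: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
    chunks: List[List[Dict[str, Any]]] = [[]]
    current_size = MAX_BYTES  # forces a fresh chunk for the first object, as in the diff
    for obj in objs:
        obj_size = len(json.dumps(obj))
        # Start a new chunk when either the byte budget or the item count would overflow.
        if obj_size + current_size > MAX_BYTES or len(chunks[-1]) >= MAX_ITEMS:
            chunks.append([])
            current_size = 0
        chunks[-1].append(obj)
        current_size += obj_size
    return [c for c in chunks if c]  # drop the empty seed chunk


batches = chunk_objects([{"aspectName": "status", "value": i} for i in range(500)])
print([len(b) for b in batches])  # e.g. [200, 200, 100]
```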
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py
ADDED

@@ -0,0 +1,98 @@
+import json
+import logging
+from typing import TYPE_CHECKING, Iterable, List
+
+from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
+from datahub.emitter.serialization_helper import pre_json_transform
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.metadata.schema_classes import (
+    DatasetProfileClass,
+    SchemaFieldClass,
+    SchemaMetadataClass,
+)
+
+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+
+logger = logging.getLogger(__name__)
+
+
+class EnsureAspectSizeProcessor:
+    def __init__(
+        self, report: "SourceReport", payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
+    ):
+        self.report = report
+        self.payload_constraint = payload_constraint
+
+    def ensure_dataset_profile_size(
+        self, dataset_urn: str, profile: DatasetProfileClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring dataset profile aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        sample_fields_size = 0
+        if profile.fieldProfiles:
+            logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
+            for field in profile.fieldProfiles:
+                if field.sampleValues:
+                    values_len = 0
+                    for value in field.sampleValues:
+                        if value:
+                            values_len += len(value)
+                    logger.debug(
+                        f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
+                    )
+                    if sample_fields_size + values_len > self.payload_constraint:
+                        field.sampleValues = []
+                        self.report.warning(
+                            title="Dataset profile truncated due to size constraint",
+                            message="Dataset profile contained too much data and would have caused ingestion to fail",
+                            context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
+                        )
+                    else:
+                        sample_fields_size += values_len
+                else:
+                    logger.debug(f"Field {field.fieldPath} has no sample values")
+
+    def ensure_schema_metadata_size(
+        self, dataset_urn: str, schema: SchemaMetadataClass
+    ) -> None:
+        """
+        This is quite arbitrary approach to ensuring schema metadata aspect does not exceed allowed size, might be adjusted
+        in the future
+        """
+        total_fields_size = 0
+        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
+        accepted_fields: List[SchemaFieldClass] = []
+        for field in schema.fields:
+            field_size = len(json.dumps(pre_json_transform(field.to_obj())))
+            logger.debug(f"Field {field.fieldPath} takes total {field_size}")
+            if total_fields_size + field_size < self.payload_constraint:
+                accepted_fields.append(field)
+                total_fields_size += field_size
+            else:
+                self.report.warning(
+                    title="Schema truncated due to size constraint",
+                    message="Dataset schema contained too much data and would have caused ingestion to fail",
+                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
+                )
+
+        schema.fields = accepted_fields
+
+    def ensure_aspect_size(
+        self,
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        We have hard limitation of aspect size being 16 MB. Some aspects can exceed that value causing an exception
+        on GMS side and failure of the entire ingestion. This processor will attempt to trim suspected aspects.
+        """
+        for wu in stream:
+            logger.debug(f"Ensuring size of workunit: {wu.id}")
+
+            if schema := wu.get_aspect_of_type(SchemaMetadataClass):
+                self.ensure_schema_metadata_size(wu.get_urn(), schema)
+            elif profile := wu.get_aspect_of_type(DatasetProfileClass):
+                self.ensure_dataset_profile_size(wu.get_urn(), profile)
+            yield wu
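As wired up in `datahub/ingestion/api/source.py` further down this diff, the processor is appended to the source's workunit-processor list. A hedged sketch of standalone usage over a workunit stream; the `workunits` iterable is assumed to come from an existing source, and constructing `SourceReport()` directly is only for illustration:

```python
from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
    EnsureAspectSizeProcessor,
)
from datahub.ingestion.api.source import SourceReport

report = SourceReport()
processor = EnsureAspectSizeProcessor(report)

# `workunits` is assumed to be an existing Iterable[MetadataWorkUnit] from a source;
# oversized schemaMetadata / datasetProfile aspects are trimmed as they stream through.
# trimmed_stream = processor.ensure_aspect_size(workunits)
```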
datahub/ingestion/api/closeable.py
CHANGED

@@ -1,9 +1,9 @@
 from abc import abstractmethod
 from contextlib import AbstractContextManager
 from types import TracebackType
-from typing import Optional, Type
+from typing import Optional, Type
 
-
+from typing_extensions import Self
 
 
 class Closeable(AbstractContextManager):
@@ -11,7 +11,7 @@ class Closeable(AbstractContextManager):
     def close(self) -> None:
         pass
 
-    def __enter__(self
+    def __enter__(self) -> Self:
         # This method is mainly required for type checking.
         return self
 
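With `__enter__` annotated as returning `Self`, a subclass used as a context manager keeps its concrete type under a type checker instead of decaying to `Closeable`. A small sketch; the `FileSink` class is hypothetical and only illustrates the typing benefit:

```python
from datahub.ingestion.api.closeable import Closeable


class FileSink(Closeable):  # hypothetical subclass for illustration
    def write(self, line: str) -> None:
        print(line)

    def close(self) -> None:
        pass


with FileSink() as sink:
    # With `__enter__(self) -> Self`, type checkers infer `sink: FileSink`
    # rather than the base `Closeable`, so `.write` resolves without a cast.
    sink.write("hello")
```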
datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py
CHANGED

@@ -1,6 +1,8 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, NewType, Optional
+from typing import Any, Dict, NewType, Optional
+
+from typing_extensions import Self
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
@@ -17,9 +19,6 @@ class IngestionCheckpointingProviderConfig(ConfigModel):
     pass
 
 
-_Self = TypeVar("_Self", bound="IngestionCheckpointingProviderBase")
-
-
 @dataclass()
 class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStatesMap]):
     """
@@ -32,9 +31,7 @@ class IngestionCheckpointingProviderBase(StatefulCommittable[CheckpointJobStates
 
     @classmethod
     @abstractmethod
-    def create(
-        cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext
-    ) -> "_Self":
+    def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> Self:
         pass
 
     @abstractmethod
datahub/ingestion/api/report.py
CHANGED
@@ -42,7 +42,10 @@ class Report(SupportsAsObj):
             return some_val.as_obj()
         elif isinstance(some_val, pydantic.BaseModel):
             return Report.to_pure_python_obj(some_val.dict())
-        elif dataclasses.is_dataclass(some_val):
+        elif dataclasses.is_dataclass(some_val) and not isinstance(some_val, type):
+            # The `is_dataclass` function returns `True` for both instances and classes.
+            # We need an extra check to ensure an instance was passed in.
+            # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass
             return dataclasses.asdict(some_val)
         elif isinstance(some_val, list):
             return [Report.to_pure_python_obj(v) for v in some_val if v is not None]
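The extra `isinstance(some_val, type)` guard matters because `dataclasses.is_dataclass` is true for the class object itself, while `dataclasses.asdict` only accepts instances. A quick demonstration with a throwaway dataclass:

```python
import dataclasses


@dataclasses.dataclass
class Point:
    x: int = 0
    y: int = 0


print(dataclasses.is_dataclass(Point))        # True -- the class, not an instance
print(dataclasses.is_dataclass(Point(1, 2)))  # True

# asdict() raises TypeError for the class object, which is what the added
# `not isinstance(some_val, type)` check protects against.
print(dataclasses.asdict(Point(1, 2)))        # {'x': 1, 'y': 2}
```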
datahub/ingestion/api/sink.py
CHANGED
@@ -3,6 +3,8 @@ from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass, field
 from typing import Any, Generic, Optional, Type, TypeVar, cast
 
+from typing_extensions import Self
+
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
@@ -79,7 +81,6 @@ class NoopWriteCallback(WriteCallback):
 
 SinkReportType = TypeVar("SinkReportType", bound=SinkReport, covariant=True)
 SinkConfig = TypeVar("SinkConfig", bound=ConfigModel, covariant=True)
-Self = TypeVar("Self", bound="Sink")
 
 
 class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
@@ -90,7 +91,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
     report: SinkReportType
 
     @classmethod
-    def get_config_class(cls
+    def get_config_class(cls) -> Type[SinkConfig]:
         config_class = get_class_from_annotation(cls, Sink, ConfigModel)
         assert config_class, "Sink subclasses must define a config class"
         return cast(Type[SinkConfig], config_class)
@@ -112,7 +113,7 @@ class Sink(Generic[SinkConfig, SinkReportType], Closeable, metaclass=ABCMeta):
         pass
 
     @classmethod
-    def create(cls
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Self":
         return cls(ctx, cls.get_config_class().parse_obj(config_dict))
 
     def handle_work_unit_start(self, workunit: WorkUnit) -> None:
datahub/ingestion/api/source.py
CHANGED
@@ -31,6 +31,9 @@ from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
@@ -450,6 +453,7 @@ class Source(Closeable, metaclass=ABCMeta):
             browse_path_processor,
             partial(auto_workunit_reporter, self.get_report()),
             auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     @staticmethod
datahub/ingestion/api/source_helpers.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-from datetime import datetime, timezone
 from typing import (
     TYPE_CHECKING,
     Dict,
@@ -14,7 +13,7 @@ from typing import (
 )
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
-from datahub.emitter.mce_builder import make_dataplatform_instance_urn
+from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import entity_supports_aspect
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -479,10 +478,7 @@ def auto_empty_dataset_usage_statistics(
     if invalid_timestamps:
         logger.warning(
             f"Usage statistics with unexpected timestamps, bucket_duration={config.bucket_duration}:\n"
-            ", ".join(
-                str(datetime.fromtimestamp(ts / 1000, tz=timezone.utc))
-                for ts in invalid_timestamps
-            )
+            ", ".join(str(parse_ts_millis(ts)) for ts in invalid_timestamps)
         )
 
     for bucket in bucket_timestamps:
datahub/ingestion/glossary/classifier.py
CHANGED

@@ -1,4 +1,3 @@
-import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
@@ -38,8 +37,8 @@ class ClassificationConfig(ConfigModel):
     )
 
     max_workers: int = Field(
-        default=
-        description="Number of worker processes to use for classification. Set to 1 to disable.",
+        default=1,
+        description="Number of worker processes to use for classification. Note that any number above 1 might lead to a deadlock. Set to 1 to disable.",
     )
 
     table_pattern: AllowDenyPattern = Field(
datahub/ingestion/graph/client.py
CHANGED

@@ -188,9 +188,12 @@ class DataHubGraph(DatahubRestEmitter):
             retry_max_times=emitter._retry_max_times,
             extra_headers=emitter._session.headers,
             disable_ssl_verification=emitter._session.verify is False,
-
-
-
+            ca_certificate_path=(
+                emitter._session.verify
+                if isinstance(emitter._session.verify, str)
+                else None
+            ),
+            client_certificate_path=emitter._session.cert,
         )
     )
 
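The new keyword arguments reuse the semantics of `requests`: `Session.verify` may be a boolean or a CA-bundle path, and `Session.cert` a client-certificate path, which is why the diff only forwards `verify` as `ca_certificate_path` when it is a string. A minimal sketch of that distinction; the filesystem paths are hypothetical:

```python
import requests

session = requests.Session()

# verify=False disables TLS verification entirely; a string points at a CA bundle.
session.verify = "/etc/ssl/certs/internal-ca.pem"  # hypothetical CA bundle path
session.cert = "/etc/ssl/certs/client.pem"         # hypothetical client cert path

ca_certificate_path = session.verify if isinstance(session.verify, str) else None
client_certificate_path = session.cert
print(ca_certificate_path, client_certificate_path)
```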