acryl-datahub 1.3.0.1rc2__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2469 -2467
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +50 -48
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/metadata/_internal_schema_classes.py +544 -544
- datahub/metadata/_urns/urn_defs.py +1728 -1728
- datahub/metadata/schema.avsc +15157 -15157
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc2.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED

@@ -4,7 +4,6 @@ import enum
 import functools
 import json
 import logging
-import os
 import pathlib
 import tempfile
 import uuid
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast

 import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
+from datahub.configuration.env_vars import get_sql_agg_query_log
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import compute_upstream_fields
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -84,7 +83,7 @@ class QueryLogSetting(enum.Enum):
 _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
 _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
 _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
-    …
+    get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
 ]
 MAX_UPSTREAM_TABLES_COUNT = 300
 MAX_FINEGRAINEDLINEAGE_COUNT = 2000
@@ -868,7 +867,7 @@ class SqlParsingAggregator(Closeable):
                downstream=parsed.out_tables[0] if parsed.out_tables else None,
                column_lineage=parsed.column_lineage,
                # TODO: We need a full list of columns referenced, not just the out tables.
-                column_usage=…
+                column_usage=self._compute_upstream_fields(parsed),
                inferred_schema=infer_output_schema(parsed),
                confidence_score=parsed.debug_info.confidence,
                extra_info=observed.extra_info,
@@ -1157,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
                actor=None,
                upstreams=parsed.in_tables,
                column_lineage=parsed.column_lineage or [],
-                column_usage=…
+                column_usage=self._compute_upstream_fields(parsed),
                confidence_score=parsed.debug_info.confidence,
            )
        )
@@ -1741,6 +1740,16 @@ class SqlParsingAggregator(Closeable):

        return resolved_query

+    @staticmethod
+    def _compute_upstream_fields(
+        result: SqlParsingResult,
+    ) -> Dict[UrnStr, Set[UrnStr]]:
+        upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
+        for cl in result.column_lineage or []:
+            for upstream in cl.upstreams:
+                upstream_fields[upstream.table].add(upstream.column)
+        return upstream_fields
+
    def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        if not self._usage_aggregator:
            return

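The new private helper replaces the standalone compute_upstream_fields function from datahub/emitter/sql_parsing_builder.py, which is removed in this release (full deletion shown further below). For orientation, a minimal sketch of the mapping shape it builds; the URNs and column names here are hypothetical, not taken from the package:

from collections import defaultdict
from typing import Dict, Set

# Hypothetical (upstream table URN, upstream column) pairs, standing in for the
# entries found on SqlParsingResult.column_lineage[*].upstreams.
lineage_pairs = [
    ("urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.orders,PROD)", "order_id"),
    ("urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.orders,PROD)", "amount"),
    ("urn:li:dataset:(urn:li:dataPlatform:snowflake,db.sch.customers,PROD)", "customer_id"),
]

# The same aggregation the new _compute_upstream_fields staticmethod performs:
# upstream table URN -> set of referenced columns, later passed as column_usage.
upstream_fields: Dict[str, Set[str]] = defaultdict(set)
for table_urn, column in lineage_pairs:
    upstream_fields[table_urn].add(column)

print(dict(upstream_fields))
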
datahub/sql_parsing/sqlglot_lineage.py CHANGED

@@ -691,6 +691,13 @@ def _column_level_lineage(
         select_statement=select_statement,
     )

+    # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+    if isinstance(select_statement, sqlglot.exp.Values):
+        return _ColumnLineageWithDebugInfo(
+            column_lineage=[],
+            select_statement=select_statement,
+        )
+
     assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
     try:
         root_scope = sqlglot.optimizer.build_scope(select_statement)

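For context, a small illustration (not from the package) of the statement shape the new exp.Values guard targets; the table name and rows are hypothetical, and it assumes sqlglot parses INSERT ... VALUES into an exp.Insert whose expression child is an exp.Values node:

import sqlglot
from sqlglot import exp

stmt = sqlglot.parse_one("INSERT INTO sales VALUES (1, 'a'), (2, 'b')")
values_body = stmt.expression  # the VALUES node under the INSERT

# A bare VALUES body references no source tables, so _column_level_lineage now
# returns empty column lineage for it instead of failing the isinstance assert.
print(isinstance(values_body, exp.Values))  # expected: True
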
datahub/telemetry/telemetry.py CHANGED

@@ -16,6 +16,11 @@ from datahub._version import __version__, nice_version_name
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
+from datahub.configuration.env_vars import (
+    get_sentry_dsn,
+    get_sentry_environment,
+    get_telemetry_timeout,
+)
 from datahub.metadata.schema_classes import _custom_package_path
 from datahub.utilities.perf_timer import PerfTimer

@@ -97,11 +102,11 @@ if any(var in os.environ for var in CI_ENV_VARS):
 if _custom_package_path:
     ENV_ENABLED = False

-TIMEOUT = int(…)
+TIMEOUT = int(get_telemetry_timeout())
 MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
 MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
-SENTRY_DSN: Optional[str] = …
-SENTRY_ENVIRONMENT: str = …
+SENTRY_DSN: Optional[str] = get_sentry_dsn()
+SENTRY_ENVIRONMENT: str = get_sentry_environment()


 def _default_global_properties() -> Dict[str, Any]:

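This hunk and the ones below follow the same pattern: direct os.environ reads are replaced by accessors from the new datahub/configuration/env_vars.py module (+331 lines, listed above but not shown in this diff). A minimal sketch of that accessor style, with the environment variable names and defaults assumed rather than taken from the package:

import os
from typing import Optional


def get_telemetry_timeout() -> str:
    # Assumed variable name and default; the call site wraps this in int().
    return os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10")


def get_sentry_dsn() -> Optional[str]:
    # Assumed variable name; None when Sentry reporting is not configured.
    return os.environ.get("SENTRY_DSN") or None


def get_sentry_environment() -> str:
    # Assumed variable name and default.
    return os.environ.get("SENTRY_ENVIRONMENT", "production")
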
datahub/utilities/file_backed_collections.py CHANGED

@@ -1,7 +1,6 @@
 import collections
 import gzip
 import logging
-import os
 import pathlib
 import pickle
 import shutil
@@ -28,6 +27,7 @@ from typing import (
     Union,
 )

+from datahub.configuration.env_vars import get_override_sqlite_version_req
 from datahub.ingestion.api.closeable import Closeable
 from datahub.utilities.sentinels import Unset, unset

@@ -36,7 +36,7 @@ logger: logging.Logger = logging.getLogger(__name__)

 def _get_sqlite_version_override() -> bool:
     """Check if SQLite version requirement should be overridden at runtime."""
-    override_str = …
+    override_str = get_override_sqlite_version_req()
     return bool(override_str and override_str.lower() != "false")


datahub/utilities/logging_manager.py CHANGED

@@ -15,13 +15,13 @@ import collections
 import contextlib
 import itertools
 import logging
-import os
 import pathlib
 import sys
 from typing import Deque, Iterator, Optional

 import click

+from datahub.configuration.env_vars import get_no_color, get_suppress_logging_manager
 from datahub.utilities.tee_io import TeeIO

 BASE_LOGGING_FORMAT = (
@@ -38,7 +38,7 @@ IN_MEMORY_LOG_BUFFER_SIZE = 2000  # lines
 IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH = 2000  # characters


-NO_COLOR = …
+NO_COLOR = get_no_color()


 def extract_name_from_filename(filename: str, fallback_name: str) -> str:
@@ -179,6 +179,18 @@ class _LogBuffer:
         return text


+class _ResilientStreamHandler(logging.StreamHandler):
+    """StreamHandler that gracefully handles closed streams."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            super().emit(record)
+        except (ValueError, OSError):
+            # Stream was closed (e.g., during pytest teardown)
+            # Silently ignore to prevent test failures
+            pass
+
+
 class _BufferLogHandler(logging.Handler):
     def __init__(self, storage: _LogBuffer) -> None:
         super().__init__()
@@ -201,7 +213,11 @@ class _BufferLogHandler(logging.Handler):
 def _remove_all_handlers(logger: logging.Logger) -> None:
     for handler in logger.handlers[:]:
         logger.removeHandler(handler)
-        …
+        try:
+            handler.close()
+        except (ValueError, OSError):
+            # Handler stream may already be closed (e.g., during pytest teardown)
+            pass


 _log_buffer = _LogBuffer(maxlen=IN_MEMORY_LOG_BUFFER_SIZE)
@@ -219,14 +235,14 @@ _default_formatter = logging.Formatter(BASE_LOGGING_FORMAT)
 def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
     _log_buffer.clear()

-    if …:
+    if get_suppress_logging_manager() == "1":
         # If we're running in pytest, we don't want to configure logging.
         yield
         return

     with contextlib.ExitStack() as stack:
         # Create stdout handler.
-        stream_handler = …
+        stream_handler = _ResilientStreamHandler()
         stream_handler.addFilter(_DatahubLogFilter(debug=debug))
         stream_handler.setFormatter(_stream_formatter)

@@ -237,7 +253,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
             tee = TeeIO(sys.stdout, file)
             stack.enter_context(contextlib.redirect_stdout(tee))  # type: ignore

-            file_handler = …
+            file_handler = _ResilientStreamHandler(file)
             file_handler.addFilter(_DatahubLogFilter(debug=True))
             file_handler.setFormatter(_default_formatter)
         else:

datahub/utilities/sample_data.py CHANGED

@@ -1,12 +1,13 @@
-import os
 import pathlib
 import tempfile

 import requests

-…
-…
-…
+from datahub.configuration.env_vars import get_docker_compose_base
+
+DOCKER_COMPOSE_BASE = (
+    get_docker_compose_base()
+    or "https://raw.githubusercontent.com/datahub-project/datahub/master"
 )
 BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"

datahub/emitter/sql_parsing_builder.py DELETED

@@ -1,306 +0,0 @@
-import logging
-import time
-from collections import defaultdict
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import Collection, Dict, Iterable, List, Optional, Set
-
-from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator
-from datahub.metadata.schema_classes import (
-    AuditStampClass,
-    DatasetLineageTypeClass,
-    FineGrainedLineageClass,
-    FineGrainedLineageDownstreamTypeClass,
-    FineGrainedLineageUpstreamTypeClass,
-    OperationClass,
-    OperationTypeClass,
-    UpstreamClass,
-    UpstreamLineageClass,
-)
-from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult
-from datahub.utilities.file_backed_collections import FileBackedDict
-
-logger = logging.getLogger(__name__)
-
-# TODO: Use this over other sources' equivalent code, if possible
-
-DatasetUrn = str
-FieldUrn = str
-UserUrn = str
-
-
-@dataclass
-class LineageEdge:
-    """Stores information about a single lineage edge, from an upstream table to a downstream table."""
-
-    downstream_urn: DatasetUrn
-    upstream_urn: DatasetUrn
-    audit_stamp: Optional[datetime]
-    actor: Optional[UserUrn]
-    type: str = DatasetLineageTypeClass.TRANSFORMED
-
-    # Maps downstream_col -> {upstream_col}
-    column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set))
-
-    def gen_upstream_aspect(self) -> UpstreamClass:
-        return UpstreamClass(
-            auditStamp=(
-                AuditStampClass(
-                    time=int(self.audit_stamp.timestamp() * 1000),
-                    actor=self.actor or "",
-                )
-                if self.audit_stamp
-                else None
-            ),
-            dataset=self.upstream_urn,
-            type=self.type,
-        )
-
-    def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]:
-        for downstream_col, upstream_cols in self.column_map.items():
-            yield FineGrainedLineageClass(
-                upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
-                # Sort to avoid creating multiple aspects in backend with same lineage but different order
-                upstreams=sorted(
-                    make_schema_field_urn(self.upstream_urn, col)
-                    for col in upstream_cols
-                ),
-                downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
-                downstreams=[
-                    make_schema_field_urn(self.downstream_urn, downstream_col)
-                ],
-            )
-
-
-@dataclass
-class SqlParsingBuilder:
-    # Open question: does it make sense to iterate over out_tables? When will we have multiple?
-
-    generate_lineage: bool = True
-    generate_usage_statistics: bool = True
-    generate_operations: bool = True
-    usage_config: Optional[BaseUsageConfig] = None
-
-    # Maps downstream urn -> upstream urn -> LineageEdge
-    # Builds up a single LineageEdge for each upstream -> downstream pair
-    _lineage_map: FileBackedDict[Dict[DatasetUrn, LineageEdge]] = field(
-        default_factory=FileBackedDict, init=False
-    )
-
-    # TODO: Replace with FileBackedDict approach like in BigQuery usage
-    _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False)
-
-    def __post_init__(self) -> None:
-        if self.usage_config:
-            self._usage_aggregator = UsageAggregator(self.usage_config)
-        elif self.generate_usage_statistics:
-            logger.info("No usage config provided, not generating usage statistics")
-            self.generate_usage_statistics = False
-
-    def process_sql_parsing_result(
-        self,
-        result: SqlParsingResult,
-        *,
-        query: str,
-        query_timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-        custom_operation_type: Optional[str] = None,
-        include_urns: Optional[Set[DatasetUrn]] = None,
-        include_column_lineage: bool = True,
-    ) -> Iterable[MetadataWorkUnit]:
-        """Process a single query and yield any generated workunits.
-
-        Args:
-            result: The result of parsing the query, or a mock result if parsing failed.
-            query: The SQL query to parse and process.
-            query_timestamp: When the query was run.
-            is_view_ddl: Whether the query is a DDL statement that creates a view.
-            user: The urn of the user who ran the query.
-            custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed.
-            include_urns: If provided, only generate workunits for these urns.
-        """
-        downstreams_to_ingest = result.out_tables
-        upstreams_to_ingest = result.in_tables
-        if include_urns:
-            logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}")
-            downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns)
-            upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns)
-
-        if self.generate_lineage:
-            for downstream_urn in downstreams_to_ingest:
-                # Set explicitly so that FileBackedDict registers any mutations
-                self._lineage_map[downstream_urn] = _merge_lineage_data(
-                    downstream_urn=downstream_urn,
-                    upstream_urns=result.in_tables,
-                    column_lineage=(
-                        result.column_lineage if include_column_lineage else None
-                    ),
-                    upstream_edges=self._lineage_map.get(downstream_urn, {}),
-                    query_timestamp=query_timestamp,
-                    is_view_ddl=is_view_ddl,
-                    user=user,
-                )
-
-        if self.generate_usage_statistics and query_timestamp is not None:
-            upstream_fields = compute_upstream_fields(result)
-            for upstream_urn in upstreams_to_ingest:
-                self._usage_aggregator.aggregate_event(
-                    resource=upstream_urn,
-                    start_time=query_timestamp,
-                    query=query,
-                    user=user,
-                    fields=sorted(upstream_fields.get(upstream_urn, [])),
-                )
-
-        if self.generate_operations and query_timestamp is not None:
-            for downstream_urn in downstreams_to_ingest:
-                yield from _gen_operation_workunit(
-                    result,
-                    downstream_urn=downstream_urn,
-                    query_timestamp=query_timestamp,
-                    user=user,
-                    custom_operation_type=custom_operation_type,
-                )
-
-    def add_lineage(
-        self,
-        downstream_urn: DatasetUrn,
-        upstream_urns: Collection[DatasetUrn],
-        timestamp: Optional[datetime] = None,
-        is_view_ddl: bool = False,
-        user: Optional[UserUrn] = None,
-    ) -> None:
-        """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails."""
-        # Set explicitly so that FileBackedDict registers any mutations
-        self._lineage_map[downstream_urn] = _merge_lineage_data(
-            downstream_urn=downstream_urn,
-            upstream_urns=upstream_urns,
-            column_lineage=None,
-            upstream_edges=self._lineage_map.get(downstream_urn, {}),
-            query_timestamp=timestamp,
-            is_view_ddl=is_view_ddl,
-            user=user,
-        )
-
-    def gen_workunits(self) -> Iterable[MetadataWorkUnit]:
-        if self.generate_lineage:
-            for mcp in self._gen_lineage_mcps():
-                yield mcp.as_workunit()
-        if self.generate_usage_statistics:
-            yield from self._gen_usage_statistics_workunits()
-
-    def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
-        for downstream_urn in self._lineage_map:
-            upstreams: List[UpstreamClass] = []
-            fine_upstreams: List[FineGrainedLineageClass] = []
-            for edge in self._lineage_map[downstream_urn].values():
-                upstreams.append(edge.gen_upstream_aspect())
-                fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects())
-
-            if not upstreams:
-                continue
-
-            upstream_lineage = UpstreamLineageClass(
-                upstreams=sorted(upstreams, key=lambda x: x.dataset),
-                fineGrainedLineages=sorted(
-                    fine_upstreams,
-                    key=lambda x: (x.downstreams, x.upstreams),
-                )
-                or None,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=downstream_urn, aspect=upstream_lineage
-            )
-
-    def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]:
-        yield from self._usage_aggregator.generate_workunits(
-            resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn
-        )
-
-
-def _merge_lineage_data(
-    downstream_urn: DatasetUrn,
-    *,
-    upstream_urns: Collection[DatasetUrn],
-    column_lineage: Optional[List[ColumnLineageInfo]],
-    upstream_edges: Dict[DatasetUrn, LineageEdge],
-    query_timestamp: Optional[datetime],
-    is_view_ddl: bool,
-    user: Optional[UserUrn],
-) -> Dict[str, LineageEdge]:
-    for upstream_urn in upstream_urns:
-        edge = upstream_edges.setdefault(
-            upstream_urn,
-            LineageEdge(
-                downstream_urn=downstream_urn,
-                upstream_urn=upstream_urn,
-                audit_stamp=query_timestamp,
-                actor=user,
-                type=(
-                    DatasetLineageTypeClass.VIEW
-                    if is_view_ddl
-                    else DatasetLineageTypeClass.TRANSFORMED
-                ),
-            ),
-        )
-        if query_timestamp and (  # Use the most recent query
-            edge.audit_stamp is None or query_timestamp > edge.audit_stamp
-        ):
-            edge.audit_stamp = query_timestamp
-            if user:
-                edge.actor = user
-
-    # Note: Inefficient as we loop through all column_lineage entries for each downstream table
-    for cl in column_lineage or []:
-        if cl.downstream.table == downstream_urn:
-            for upstream_column_info in cl.upstreams:
-                if upstream_column_info.table not in upstream_urns:
-                    continue
-                column_map = upstream_edges[upstream_column_info.table].column_map
-                column_map[cl.downstream.column].add(upstream_column_info.column)
-
-    return upstream_edges
-
-
-def compute_upstream_fields(
-    result: SqlParsingResult,
-) -> Dict[DatasetUrn, Set[DatasetUrn]]:
-    upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set)
-    for cl in result.column_lineage or []:
-        for upstream in cl.upstreams:
-            upstream_fields[upstream.table].add(upstream.column)
-    return upstream_fields
-
-
-def _gen_operation_workunit(
-    result: SqlParsingResult,
-    *,
-    downstream_urn: DatasetUrn,
-    query_timestamp: datetime,
-    user: Optional[UserUrn],
-    custom_operation_type: Optional[str],
-) -> Iterable[MetadataWorkUnit]:
-    operation_type = result.query_type.to_operation_type()
-    # Filter out SELECT and other undesired statements
-    if operation_type is None:
-        return
-    elif operation_type == OperationTypeClass.UNKNOWN:
-        if custom_operation_type is None:
-            return
-        else:
-            operation_type = OperationTypeClass.CUSTOM
-
-    aspect = OperationClass(
-        timestampMillis=int(time.time() * 1000),
-        operationType=operation_type,
-        lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000),
-        actor=user,
-        customOperationType=custom_operation_type,
-    )
-    yield MetadataChangeProposalWrapper(
-        entityUrn=downstream_urn, aspect=aspect
-    ).as_workunit()
