acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2557 -2557
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +34 -30
- datahub/__init__.py +1 -1
- datahub/configuration/kafka_consumer_config.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
- datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
- datahub/ingestion/source/ge_data_profiler.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +6 -2
- datahub/ingestion/source/redshift/report.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
- datahub/ingestion/source/sql/mssql/job_models.py +1 -0
- datahub/ingestion/source/sql/mssql/source.py +113 -38
- datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
- datahub/ingestion/source/sql/oracle.py +50 -0
- datahub/ingestion/source/sql/sql_common.py +28 -54
- datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
- datahub/ingestion/source/sql/sql_report.py +75 -0
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/sql/vertica.py +2 -2
- datahub/ingestion/source/unity/report.py +2 -2
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/AssertionInfo.avsc +1 -1
- datahub/metadata/schemas/InputFields.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
- datahub/sql_parsing/datajob.py +50 -0
- datahub/sql_parsing/query_types.py +10 -1
- datahub/sql_parsing/split_statements.py +163 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/teradata.py
CHANGED

@@ -44,7 +44,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
@@ -330,7 +330,7 @@ def optimized_get_view_definition(
 
 
 @dataclass
-class TeradataReport(
+class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
     num_queries_parsed: int = 0
     num_view_ddl_parsed: int = 0
     num_table_parse_failures: int = 0
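The common thread in this and the report changes below is that SQLSourceReport moved out of sql_common into the new sql_report module (sql_report.py, +75 in the file list above). A minimal sketch of the import change for downstream code, assuming only the module path shown in this diff; the report_dropped call mirrors the one visible in the vertica hunk further down:

    # rc4: SQLSourceReport was exported from the catch-all sql_common module.
    # from datahub.ingestion.source.sql.sql_common import SQLSourceReport

    # rc6: it now lives in its own module.
    from datahub.ingestion.source.sql.sql_report import SQLSourceReport

    report = SQLSourceReport()
    report.report_dropped("profile of some_dataset")  # hypothetical entity name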
datahub/ingestion/source/sql/vertica.py
CHANGED

@@ -27,7 +27,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SQLSourceReport,
     SqlWorkUnit,
     get_schema_metadata,
 )
@@ -35,6 +34,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import get_domain_wu
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
@@ -536,7 +536,7 @@ class VerticaSource(SQLAlchemySource):
         )
 
         if not self.is_dataset_eligible_for_profiling(
-            dataset_name,
+            dataset_name, schema, inspector, profile_candidates
         ):
             if self.config.profiling.report_dropped_profiles:
                 self.report.report_dropped(f"profile of {dataset_name}")
datahub/ingestion/source/unity/report.py
CHANGED

@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from typing import Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
-from datahub.ingestion.source.sql.
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
@@ -19,7 +19,7 @@ class UnityCatalogUsagePerfReport(Report):
 
 
 @dataclass
-class UnityCatalogReport(IngestionStageReport,
+class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     metastores: EntityFilterReport = EntityFilterReport.field(type="metastore")
     catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog")
     schemas: EntityFilterReport = EntityFilterReport.field(type="schema")
datahub/metadata/schema.avsc
CHANGED

datahub/sql_parsing/datajob.py
ADDED
@@ -0,0 +1,50 @@
+import logging
+from typing import Iterable, List, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.schema_classes import (
+    DataJobInputOutputClass,
+    FineGrainedLineageClass,
+    UpstreamLineageClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_datajob_input_output(
+    *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+) -> Optional[DataJobInputOutputClass]:
+    inputDatasets: List[str] = []
+    outputDatasets: List[str] = []
+    fineGrainedLineages: List[FineGrainedLineageClass] = []
+    for mcp in mcps:
+        # TODO: Represent simple write operations without lineage as outputDatasets.
+
+        upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+        if upstream_lineage is not None:
+            if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+                outputDatasets.append(mcp.entityUrn)
+
+            for upstream in upstream_lineage.upstreams:
+                if upstream.dataset not in inputDatasets:
+                    inputDatasets.append(upstream.dataset)
+
+            if upstream_lineage.fineGrainedLineages:
+                for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+                    fineGrainedLineages.append(fineGrainedLineage)
+
+        elif ignore_extra_mcps:
+            pass
+        else:
+            raise ValueError(
+                f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+            )
+
+    if not inputDatasets and not outputDatasets:
+        return None
+
+    return DataJobInputOutputClass(
+        inputDatasets=inputDatasets,
+        outputDatasets=outputDatasets,
+        fineGrainedLineages=fineGrainedLineages,
+    )
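to_datajob_input_output collapses a stream of lineage MCPs into a single DataJobInputOutput aspect: any entity carrying an upstreamLineage aspect becomes an output dataset, and its upstreams become inputs. A minimal usage sketch with made-up URNs; only to_datajob_input_output itself comes from this diff, the rest is the standard datahub emitter API:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import (
        DatasetLineageTypeClass,
        UpstreamClass,
        UpstreamLineageClass,
    )
    from datahub.sql_parsing.datajob import to_datajob_input_output

    # Hypothetical dataset URNs, for illustration only.
    src = "urn:li:dataset:(urn:li:dataPlatform:mssql,db.dbo.source_table,PROD)"
    dst = "urn:li:dataset:(urn:li:dataPlatform:mssql,db.dbo.target_table,PROD)"

    mcp = MetadataChangeProposalWrapper(
        entityUrn=dst,
        aspect=UpstreamLineageClass(
            upstreams=[
                UpstreamClass(dataset=src, type=DatasetLineageTypeClass.TRANSFORMED)
            ]
        ),
    )

    io = to_datajob_input_output(mcps=[mcp])
    assert io is not None
    assert io.inputDatasets == [src] and io.outputDatasets == [dst]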
datahub/sql_parsing/query_types.py
CHANGED

@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
     identifier: sqlglot.exp.Identifier = table.this
 
     return identifier.args.get("temporary") or (
-
+        # These dialects use # as a prefix for temp tables.
+        is_dialect_instance(
+            dialect,
+            [
+                "redshift",
+                "mssql",
+                # sybase is another one, but we don't support that dialect yet.
+            ],
+        )
+        and identifier.name.startswith("#")
     )
 
 
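The added branch means that, for the mssql and redshift dialects, a leading "#" on the table identifier marks a temp table. A hedged illustration using sqlglot directly (sqlglot's name for the MSSQL dialect is "tsql"; the query is made up):

    import sqlglot

    parsed = sqlglot.parse_one("SELECT * FROM #staging", read="tsql")
    table = parsed.find(sqlglot.exp.Table)
    assert table is not None
    # sqlglot keeps the leading "#" in the parsed identifier, which is what
    # the identifier.name.startswith("#") check above keys on.
    print(table.name)  # "#staging"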
datahub/sql_parsing/split_statements.py
ADDED

@@ -0,0 +1,163 @@
+import re
+from enum import Enum
+from typing import Generator, List, Tuple
+
+CONTROL_FLOW_KEYWORDS = [
+    "GO",
+    r"BEGIN\w+TRY",
+    r"BEGIN\w+CATCH",
+    "BEGIN",
+    r"END\w+TRY",
+    r"END\w+CATCH",
+    "END",
+]
+
+# There's an exception to this rule, which is when the statement
+# is preceeded by a CTE.
+FORCE_NEW_STATEMENT_KEYWORDS = [
+    # SELECT is used inside queries as well, so we can't include it here.
+    "INSERT",
+    "UPDATE",
+    "DELETE",
+    "MERGE",
+]
+
+
+class ParserState(Enum):
+    NORMAL = 1
+    STRING = 2
+    COMMENT = 3
+    MULTILINE_COMMENT = 4
+
+
+def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+    """
+    Check if a keyword exists at the given position using regex word boundaries.
+    """
+    if pos + len(keyword) > len(sql):
+        return False
+
+    # If we're not at a word boundary, we can't generate a keyword.
+    if pos > 0 and not (
+        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+    ):
+        return False
+
+    pattern = rf"^{re.escape(keyword)}\b"
+    match = re.match(pattern, sql[pos:], re.IGNORECASE)
+    return bool(match)
+
+
+def _look_ahead_for_keywords(
+    sql: str, pos: int, keywords: List[str]
+) -> Tuple[bool, str, int]:
+    """
+    Look ahead for SQL keywords at the current position.
+    """
+
+    for keyword in keywords:
+        if _is_keyword_at_position(sql, pos, keyword):
+            return True, keyword, len(keyword)
+    return False, "", 0
+
+
+def split_statements(sql: str) -> Generator[str, None, None]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+    if not sql or not sql.strip():
+        return
+
+    current_statement: List[str] = []
+    state = ParserState.NORMAL
+    i = 0
+
+    def yield_if_complete() -> Generator[str, None, None]:
+        statement = "".join(current_statement).strip()
+        if statement:
+            yield statement
+        current_statement.clear()
+
+    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+    while i < len(sql):
+        c = sql[i]
+        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+        if state == ParserState.NORMAL:
+            if c == "'":
+                state = ParserState.STRING
+                current_statement.append(c)
+                prev_real_char = c
+            elif c == "-" and next_char == "-":
+                state = ParserState.COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            elif c == "/" and next_char == "*":
+                state = ParserState.MULTILINE_COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            else:
+                most_recent_real_char = prev_real_char
+                if not c.isspace():
+                    prev_real_char = c
+
+                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
+                )
+                if is_control_keyword:
+                    # Yield current statement if any
+                    yield from yield_if_complete()
+                    # Yield keyword as its own statement
+                    yield keyword
+                    i += keyword_len
+                    continue
+
+                (
+                    is_force_new_statement_keyword,
+                    keyword,
+                    keyword_len,
+                ) = _look_ahead_for_keywords(
+                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+                )
+                if (
+                    is_force_new_statement_keyword and most_recent_real_char != ")"
+                ):  # usually we'd have a close paren that closes a CTE
+                    # Force termination of current statement
+                    yield from yield_if_complete()
+
+                    current_statement.append(keyword)
+                    i += keyword_len
+                    continue
+
+                elif c == ";":
+                    yield from yield_if_complete()
+                else:
+                    current_statement.append(c)
+
+        elif state == ParserState.STRING:
+            current_statement.append(c)
+            if c == "'" and next_char == "'":
+                current_statement.append(next_char)
+                i += 1
+            elif c == "'":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.COMMENT:
+            current_statement.append(c)
+            if c == "\n":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.MULTILINE_COMMENT:
+            current_statement.append(c)
+            if c == "*" and next_char == "/":
+                current_statement.append(next_char)
+                i += 1
+                state = ParserState.NORMAL
+
+        i += 1
+
+    # Handle the last statement
+    yield from yield_if_complete()
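split_statements is a character-level state machine rather than a full parser: it tracks string, line-comment, and block-comment state, emits control-flow keywords such as GO as standalone statements, and force-splits on INSERT/UPDATE/DELETE/MERGE unless the preceding non-whitespace character is a closing paren (the CTE exception noted in the module's comments). A rough sketch of the intended behavior on a made-up T-SQL batch:

    from datahub.sql_parsing.split_statements import split_statements

    batch = """
    CREATE TABLE #staging (id INT);
    INSERT INTO #staging SELECT id FROM dbo.source_table
    GO
    UPDATE dbo.target_table SET processed = 1
    """

    for statement in split_statements(batch):
        print(repr(statement))
    # Expected, roughly: the CREATE TABLE, then the INSERT (split off even
    # without a trailing semicolon), then "GO" on its own, then the UPDATE.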
{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL
RENAMED
File without changes

{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt
RENAMED
File without changes

{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt
RENAMED
File without changes