acryl-datahub 0.14.1.13rc3__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/METADATA +2493 -2493
- {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD +25 -20
- {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt +2 -2
- datahub/__init__.py +1 -1
- datahub/configuration/kafka.py +12 -1
- datahub/configuration/kafka_consumer_config.py +35 -0
- datahub/ingestion/source/confluent_schema_registry.py +4 -2
- datahub/ingestion/source/kafka/__init__.py +0 -0
- datahub/ingestion/source/{kafka.py → kafka/kafka.py} +12 -2
- datahub/ingestion/source/sql/mssql/job_models.py +1 -0
- datahub/ingestion/source/sql/mssql/source.py +113 -38
- datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
- datahub/ingestion/source/sql/sql_common.py +10 -2
- datahub/sql_parsing/datajob.py +50 -0
- datahub/sql_parsing/query_types.py +10 -1
- datahub/sql_parsing/split_statements.py +163 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
- datahub/sql_parsing/sql_parsing_common.py +6 -0
- datahub/sql_parsing/sqlglot_lineage.py +49 -7
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -0
- {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt +0 -0
- /datahub/ingestion/source/{kafka_connect.py → kafka/kafka_connect.py} +0 -0
- /datahub/ingestion/source/{kafka_schema_registry_base.py → kafka/kafka_schema_registry_base.py} +0 -0
datahub/sql_parsing/datajob.py
ADDED
@@ -0,0 +1,50 @@
+import logging
+from typing import Iterable, List, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.schema_classes import (
+    DataJobInputOutputClass,
+    FineGrainedLineageClass,
+    UpstreamLineageClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_datajob_input_output(
+    *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+) -> Optional[DataJobInputOutputClass]:
+    inputDatasets: List[str] = []
+    outputDatasets: List[str] = []
+    fineGrainedLineages: List[FineGrainedLineageClass] = []
+    for mcp in mcps:
+        # TODO: Represent simple write operations without lineage as outputDatasets.
+
+        upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+        if upstream_lineage is not None:
+            if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+                outputDatasets.append(mcp.entityUrn)
+
+            for upstream in upstream_lineage.upstreams:
+                if upstream.dataset not in inputDatasets:
+                    inputDatasets.append(upstream.dataset)
+
+            if upstream_lineage.fineGrainedLineages:
+                for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+                    fineGrainedLineages.append(fineGrainedLineage)
+
+        elif ignore_extra_mcps:
+            pass
+        else:
+            raise ValueError(
+                f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+            )
+
+    if not inputDatasets and not outputDatasets:
+        return None
+
+    return DataJobInputOutputClass(
+        inputDatasets=inputDatasets,
+        outputDatasets=outputDatasets,
+        fineGrainedLineages=fineGrainedLineages,
+    )
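The helper above collapses a batch of lineage MCPs into a single DataJobInputOutput aspect; the new MSSQL stored-procedure lineage uses it to attach table-level lineage to a DataJob. A minimal usage sketch, with made-up URNs and a single hand-built upstreamLineage aspect standing in for what the SQL parsing aggregator would emit:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)
from datahub.sql_parsing.datajob import to_datajob_input_output

# Hypothetical lineage: summary_table is written from source_table.
source_urn = make_dataset_urn("mssql", "analytics.dbo.source_table")
summary_urn = make_dataset_urn("mssql", "analytics.dbo.summary_table")

mcp = MetadataChangeProposalWrapper(
    entityUrn=summary_urn,
    aspect=UpstreamLineageClass(
        upstreams=[
            UpstreamClass(dataset=source_urn, type=DatasetLineageTypeClass.TRANSFORMED)
        ]
    ),
)

io = to_datajob_input_output(mcps=[mcp])
if io:
    print(io.inputDatasets)   # [source_urn]
    print(io.outputDatasets)  # [summary_urn]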
datahub/sql_parsing/query_types.py
@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
     identifier: sqlglot.exp.Identifier = table.this
 
     return identifier.args.get("temporary") or (
-
+        # These dialects use # as a prefix for temp tables.
+        is_dialect_instance(
+            dialect,
+            [
+                "redshift",
+                "mssql",
+                # sybase is another one, but we don't support that dialect yet.
+            ],
+        )
+        and identifier.name.startswith("#")
     )
 
 
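With this change the parser treats "#"-prefixed tables as temporary on MSSQL as well as Redshift, so scratch tables created inside stored procedures do not surface as real lineage endpoints. A standalone sketch of the naming convention being checked, independent of sqlglot and with made-up table names:

def looks_like_temp_table(name: str, platform: str) -> bool:
    # Redshift and MSSQL (and Sybase) use a leading "#" for session-scoped temp tables.
    return platform in ("redshift", "mssql") and name.startswith("#")

print(looks_like_temp_table("#staging_sales", "mssql"))      # True
print(looks_like_temp_table("staging_sales", "mssql"))       # False
print(looks_like_temp_table("#staging_sales", "snowflake"))  # False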
datahub/sql_parsing/split_statements.py
ADDED
@@ -0,0 +1,163 @@
+import re
+from enum import Enum
+from typing import Generator, List, Tuple
+
+CONTROL_FLOW_KEYWORDS = [
+    "GO",
+    r"BEGIN\w+TRY",
+    r"BEGIN\w+CATCH",
+    "BEGIN",
+    r"END\w+TRY",
+    r"END\w+CATCH",
+    "END",
+]
+
+# There's an exception to this rule, which is when the statement
+# is preceeded by a CTE.
+FORCE_NEW_STATEMENT_KEYWORDS = [
+    # SELECT is used inside queries as well, so we can't include it here.
+    "INSERT",
+    "UPDATE",
+    "DELETE",
+    "MERGE",
+]
+
+
+class ParserState(Enum):
+    NORMAL = 1
+    STRING = 2
+    COMMENT = 3
+    MULTILINE_COMMENT = 4
+
+
+def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+    """
+    Check if a keyword exists at the given position using regex word boundaries.
+    """
+    if pos + len(keyword) > len(sql):
+        return False
+
+    # If we're not at a word boundary, we can't generate a keyword.
+    if pos > 0 and not (
+        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+    ):
+        return False
+
+    pattern = rf"^{re.escape(keyword)}\b"
+    match = re.match(pattern, sql[pos:], re.IGNORECASE)
+    return bool(match)
+
+
+def _look_ahead_for_keywords(
+    sql: str, pos: int, keywords: List[str]
+) -> Tuple[bool, str, int]:
+    """
+    Look ahead for SQL keywords at the current position.
+    """
+
+    for keyword in keywords:
+        if _is_keyword_at_position(sql, pos, keyword):
+            return True, keyword, len(keyword)
+    return False, "", 0
+
+
+def split_statements(sql: str) -> Generator[str, None, None]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+    if not sql or not sql.strip():
+        return
+
+    current_statement: List[str] = []
+    state = ParserState.NORMAL
+    i = 0
+
+    def yield_if_complete() -> Generator[str, None, None]:
+        statement = "".join(current_statement).strip()
+        if statement:
+            yield statement
+        current_statement.clear()
+
+    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+    while i < len(sql):
+        c = sql[i]
+        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+        if state == ParserState.NORMAL:
+            if c == "'":
+                state = ParserState.STRING
+                current_statement.append(c)
+                prev_real_char = c
+            elif c == "-" and next_char == "-":
+                state = ParserState.COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            elif c == "/" and next_char == "*":
+                state = ParserState.MULTILINE_COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            else:
+                most_recent_real_char = prev_real_char
+                if not c.isspace():
+                    prev_real_char = c
+
+                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
+                )
+                if is_control_keyword:
+                    # Yield current statement if any
+                    yield from yield_if_complete()
+                    # Yield keyword as its own statement
+                    yield keyword
+                    i += keyword_len
+                    continue
+
+                (
+                    is_force_new_statement_keyword,
+                    keyword,
+                    keyword_len,
+                ) = _look_ahead_for_keywords(
+                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+                )
+                if (
+                    is_force_new_statement_keyword and most_recent_real_char != ")"
+                ):  # usually we'd have a close paren that closes a CTE
+                    # Force termination of current statement
+                    yield from yield_if_complete()
+
+                    current_statement.append(keyword)
+                    i += keyword_len
+                    continue
+
+                elif c == ";":
+                    yield from yield_if_complete()
+                else:
+                    current_statement.append(c)
+
+        elif state == ParserState.STRING:
+            current_statement.append(c)
+            if c == "'" and next_char == "'":
+                current_statement.append(next_char)
+                i += 1
+            elif c == "'":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.COMMENT:
+            current_statement.append(c)
+            if c == "\n":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.MULTILINE_COMMENT:
+            current_statement.append(c)
+            if c == "*" and next_char == "/":
+                current_statement.append(next_char)
+                i += 1
+                state = ParserState.NORMAL
+
+        i += 1
+
+    # Handle the last statement
+    yield from yield_if_complete()
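Since the MSSQL source now parses stored-procedure bodies for lineage, this splitter is what breaks a procedure definition into individual statements that can be fed to the SQL parser one at a time. A small usage sketch (the procedure body is made up, and the exact whitespace of the yielded fragments may differ):

from datahub.sql_parsing.split_statements import split_statements

procedure_body = """
CREATE PROCEDURE dbo.update_sales_summary AS
BEGIN
    INSERT INTO dbo.sales_summary
    SELECT region, SUM(amount) FROM dbo.sales GROUP BY region;
    UPDATE dbo.watermark SET last_run = GETDATE()
END
GO
"""

for statement in split_statements(procedure_body):
    print(repr(statement))

# Control-flow keywords (BEGIN, END, GO) are yielded as standalone statements, and
# INSERT/UPDATE/DELETE/MERGE force a break even without a terminating semicolon,
# so the INSERT ... SELECT and the UPDATE above come out as separate statements.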
datahub/sql_parsing/sql_parsing_common.py
@@ -21,6 +21,9 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
     # See more below:
     # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm
     "teradata",
+    # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
+    # are case preserving but case insensitive.
+    "mssql",
 }
 DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
     # In some dialects, column identifiers are effectively case insensitive
@@ -28,6 +31,9 @@ DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
     # automatically lowercase unquoted identifiers.
     "snowflake",
 }
+assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
+    DIALECTS_WITH_CASE_INSENSITIVE_COLS
+)
 
 
 class QueryType(enum.Enum):
datahub/sql_parsing/sqlglot_lineage.py
@@ -5,7 +5,7 @@ import functools
 import logging
 import traceback
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union
 
 import pydantic.dataclasses
 import sqlglot
@@ -873,6 +873,49 @@ def _translate_internal_column_lineage(
     )
 
 
+_StrOrNone = TypeVar("_StrOrNone", str, Optional[str])
+
+
+def _normalize_db_or_schema(
+    db_or_schema: _StrOrNone,
+    dialect: sqlglot.Dialect,
+) -> _StrOrNone:
+    if db_or_schema is None:
+        return None
+
+    # In snowflake, table identifiers must be uppercased to match sqlglot's behavior.
+    if is_dialect_instance(dialect, "snowflake"):
+        return db_or_schema.upper()
+
+    # In mssql, table identifiers must be lowercased.
+    elif is_dialect_instance(dialect, "mssql"):
+        return db_or_schema.lower()
+
+    return db_or_schema
+
+
+def _simplify_select_into(statement: sqlglot.exp.Expression) -> sqlglot.exp.Expression:
+    """
+    Check if the expression is a SELECT INTO statement. If so, converts it into a CTAS.
+    Other expressions are returned as-is.
+    """
+
+    if not (isinstance(statement, sqlglot.exp.Select) and statement.args.get("into")):
+        return statement
+
+    # Convert from SELECT <cols> INTO <out> <expr>
+    # to CREATE TABLE <out> AS SELECT <cols> <expr>
+    into_expr: sqlglot.exp.Into = statement.args["into"].pop()
+    into_table = into_expr.this
+
+    create = sqlglot.exp.Create(
+        this=into_table,
+        kind="TABLE",
+        expression=statement,
+    )
+    return create
+
+
 def _sqlglot_lineage_inner(
     sql: sqlglot.exp.ExpOrStr,
     schema_resolver: SchemaResolverInterface,
@@ -885,12 +928,9 @@ def _sqlglot_lineage_inner(
     else:
         dialect = get_dialect(default_dialect)
 
-
-
-
-            default_db = default_db.upper()
-        if default_schema:
-            default_schema = default_schema.upper()
+    default_db = _normalize_db_or_schema(default_db, dialect)
+    default_schema = _normalize_db_or_schema(default_schema, dialect)
+
     if is_dialect_instance(dialect, "redshift") and not default_schema:
         # On Redshift, there's no "USE SCHEMA <schema>" command. The default schema
         # is public, and "current schema" is the one at the front of the search path.
@@ -918,6 +958,8 @@
     # original_statement.sql(pretty=True, dialect=dialect),
     # )
 
+    statement = _simplify_select_into(statement)
+
     # Make sure the tables are resolved with the default db / schema.
     # This only works for Unionable statements. For other types of statements,
     # we have to do it manually afterwards, but that's slightly lower accuracy
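The new _simplify_select_into step rewrites T-SQL "SELECT ... INTO ..." statements into the equivalent CREATE TABLE ... AS shape before lineage extraction, so the INTO target is reported as an output table. A quick way to see the sqlglot construct it keys on (the query and table names are made up):

import sqlglot

statement = sqlglot.parse_one(
    "SELECT id, amount INTO reporting.sales_copy FROM dbo.sales", dialect="tsql"
)

# SELECT ... INTO parses as a Select expression carrying an "into" argument;
# that argument is what the helper pops and wraps in a CREATE TABLE ... AS node.
print(type(statement))             # <class 'sqlglot.expressions.Select'>
print(statement.args.get("into"))  # the INTO target expression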
datahub/sql_parsing/sqlglot_utils.py
@@ -61,7 +61,7 @@ def is_dialect_instance(
     else:
         platforms = list(platforms)
 
-    dialects = [
+    dialects = [get_dialect(platform) for platform in platforms]
 
     if any(isinstance(dialect, dialect_class.__class__) for dialect_class in dialects):
         return True
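For context, is_dialect_instance maps platform names to sqlglot dialects via get_dialect and then checks whether the given dialect is an instance of any of them. A small usage sketch, with the expected results shown as comments:

from datahub.sql_parsing.sqlglot_utils import get_dialect, is_dialect_instance

dialect = get_dialect("mssql")
print(is_dialect_instance(dialect, ["redshift", "mssql"]))  # expected: True
print(is_dialect_instance(dialect, "snowflake"))            # expected: False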
datahub/utilities/file_backed_collections.py
@@ -228,6 +228,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         else:
             self._conn = ConnectionWrapper()
 
+        if sqlite3.sqlite_version_info < (3, 24, 0):
+            # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
+            # This was added in 3.24.0 from 2018-06-04.
+            # See https://www.sqlite.org/lang_conflict.html
+            raise RuntimeError("SQLite version 3.24.0 or later is required")
+
         # We keep a small cache in memory to avoid having to serialize/deserialize
         # data from the database too often. We use an OrderedDict to build
         # a poor-man's LRU cache.
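The guard fails fast when Python is linked against an SQLite build older than 3.24.0, since FileBackedDict relies on the INSERT ... ON CONFLICT ... DO UPDATE upsert syntax introduced in that release. A quick way to check the runtime library and exercise the feature (table and column names are illustrative):

import sqlite3

print(sqlite3.sqlite_version_info)  # e.g. (3, 45, 1); must be >= (3, 24, 0)

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE kv (key TEXT PRIMARY KEY, value TEXT)")

# The upsert form that requires SQLite >= 3.24.0:
conn.execute(
    "INSERT INTO kv (key, value) VALUES (?, ?) "
    "ON CONFLICT (key) DO UPDATE SET value = excluded.value",
    ("a", "1"),
)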
The remaining +0 -0 entries in the file list above are renames or metadata files with no content changes.

/datahub/ingestion/source/{kafka_schema_registry_base.py → kafka/kafka_schema_registry_base.py}
RENAMED (file without changes)