acryl-datahub 0.14.1.13rc3__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (25)
  1. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/METADATA +2493 -2493
  2. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD +25 -20
  3. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt +2 -2
  4. datahub/__init__.py +1 -1
  5. datahub/configuration/kafka.py +12 -1
  6. datahub/configuration/kafka_consumer_config.py +35 -0
  7. datahub/ingestion/source/confluent_schema_registry.py +4 -2
  8. datahub/ingestion/source/kafka/__init__.py +0 -0
  9. datahub/ingestion/source/{kafka.py → kafka/kafka.py} +12 -2
  10. datahub/ingestion/source/sql/mssql/job_models.py +1 -0
  11. datahub/ingestion/source/sql/mssql/source.py +113 -38
  12. datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
  13. datahub/ingestion/source/sql/sql_common.py +10 -2
  14. datahub/sql_parsing/datajob.py +50 -0
  15. datahub/sql_parsing/query_types.py +10 -1
  16. datahub/sql_parsing/split_statements.py +163 -0
  17. datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
  18. datahub/sql_parsing/sql_parsing_common.py +6 -0
  19. datahub/sql_parsing/sqlglot_lineage.py +49 -7
  20. datahub/sql_parsing/sqlglot_utils.py +1 -1
  21. datahub/utilities/file_backed_collections.py +6 -0
  22. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL +0 -0
  23. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt +0 -0
  24. /datahub/ingestion/source/{kafka_connect.py → kafka/kafka_connect.py} +0 -0
  25. /datahub/ingestion/source/{kafka_schema_registry_base.py → kafka/kafka_schema_registry_base.py} +0 -0

datahub/sql_parsing/datajob.py
@@ -0,0 +1,50 @@
+ import logging
+ from typing import Iterable, List, Optional
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.metadata.schema_classes import (
+     DataJobInputOutputClass,
+     FineGrainedLineageClass,
+     UpstreamLineageClass,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def to_datajob_input_output(
+     *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+ ) -> Optional[DataJobInputOutputClass]:
+     inputDatasets: List[str] = []
+     outputDatasets: List[str] = []
+     fineGrainedLineages: List[FineGrainedLineageClass] = []
+     for mcp in mcps:
+         # TODO: Represent simple write operations without lineage as outputDatasets.
+
+         upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+         if upstream_lineage is not None:
+             if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+                 outputDatasets.append(mcp.entityUrn)
+
+             for upstream in upstream_lineage.upstreams:
+                 if upstream.dataset not in inputDatasets:
+                     inputDatasets.append(upstream.dataset)
+
+             if upstream_lineage.fineGrainedLineages:
+                 for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+                     fineGrainedLineages.append(fineGrainedLineage)
+
+         elif ignore_extra_mcps:
+             pass
+         else:
+             raise ValueError(
+                 f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+             )
+
+     if not inputDatasets and not outputDatasets:
+         return None
+
+     return DataJobInputOutputClass(
+         inputDatasets=inputDatasets,
+         outputDatasets=outputDatasets,
+         fineGrainedLineages=fineGrainedLineages,
+     )
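
An illustrative sketch (not part of the diff) of how the new to_datajob_input_output helper might be used; the URNs below are invented:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)
from datahub.sql_parsing.datajob import to_datajob_input_output

# A lineage MCP: orders_summary is derived from orders (example URNs only).
upstream = UpstreamClass(
    dataset="urn:li:dataset:(urn:li:dataPlatform:mssql,demo_db.dbo.orders,PROD)",
    type=DatasetLineageTypeClass.TRANSFORMED,
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:mssql,demo_db.dbo.orders_summary,PROD)",
    aspect=UpstreamLineageClass(upstreams=[upstream]),
)

io = to_datajob_input_output(mcps=[mcp])
# Expected: the upstream dataset ends up in inputDatasets and the MCP's entityUrn in outputDatasets.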

datahub/sql_parsing/query_types.py
@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
      identifier: sqlglot.exp.Identifier = table.this

      return identifier.args.get("temporary") or (
-         is_dialect_instance(dialect, "redshift") and identifier.name.startswith("#")
+         # These dialects use # as a prefix for temp tables.
+         is_dialect_instance(
+             dialect,
+             [
+                 "redshift",
+                 "mssql",
+                 # sybase is another one, but we don't support that dialect yet.
+             ],
+         )
+         and identifier.name.startswith("#")
      )

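
An illustrative sketch only (not from the diff): the check above relies on sqlglot keeping the leading "#" on T-SQL temp-table identifiers, so the startswith("#") test now fires for MSSQL as well:

import sqlglot

expr = sqlglot.parse_one("SELECT * FROM #staging_orders", read="tsql")
table = expr.find(sqlglot.exp.Table)
assert table is not None
print(table.this.name)                  # expected: "#staging_orders"
print(table.this.name.startswith("#"))  # expected: True -> treated as a temp table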

datahub/sql_parsing/split_statements.py
@@ -0,0 +1,163 @@
+ import re
+ from enum import Enum
+ from typing import Generator, List, Tuple
+
+ CONTROL_FLOW_KEYWORDS = [
+     "GO",
+     r"BEGIN\w+TRY",
+     r"BEGIN\w+CATCH",
+     "BEGIN",
+     r"END\w+TRY",
+     r"END\w+CATCH",
+     "END",
+ ]
+
+ # There's an exception to this rule, which is when the statement
+ # is preceded by a CTE.
+ FORCE_NEW_STATEMENT_KEYWORDS = [
+     # SELECT is used inside queries as well, so we can't include it here.
+     "INSERT",
+     "UPDATE",
+     "DELETE",
+     "MERGE",
+ ]
+
+
+ class ParserState(Enum):
+     NORMAL = 1
+     STRING = 2
+     COMMENT = 3
+     MULTILINE_COMMENT = 4
+
+
+ def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+     """
+     Check if a keyword exists at the given position using regex word boundaries.
+     """
+     if pos + len(keyword) > len(sql):
+         return False
+
+     # If we're not at a word boundary, we can't generate a keyword.
+     if pos > 0 and not (
+         bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+         or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+     ):
+         return False
+
+     pattern = rf"^{re.escape(keyword)}\b"
+     match = re.match(pattern, sql[pos:], re.IGNORECASE)
+     return bool(match)
+
+
+ def _look_ahead_for_keywords(
+     sql: str, pos: int, keywords: List[str]
+ ) -> Tuple[bool, str, int]:
+     """
+     Look ahead for SQL keywords at the current position.
+     """
+
+     for keyword in keywords:
+         if _is_keyword_at_position(sql, pos, keyword):
+             return True, keyword, len(keyword)
+     return False, "", 0
+
+
+ def split_statements(sql: str) -> Generator[str, None, None]:
+     """
+     Split T-SQL code into individual statements, handling various SQL constructs.
+     """
+     if not sql or not sql.strip():
+         return
+
+     current_statement: List[str] = []
+     state = ParserState.NORMAL
+     i = 0
+
+     def yield_if_complete() -> Generator[str, None, None]:
+         statement = "".join(current_statement).strip()
+         if statement:
+             yield statement
+         current_statement.clear()
+
+     prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+     while i < len(sql):
+         c = sql[i]
+         next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+         if state == ParserState.NORMAL:
+             if c == "'":
+                 state = ParserState.STRING
+                 current_statement.append(c)
+                 prev_real_char = c
+             elif c == "-" and next_char == "-":
+                 state = ParserState.COMMENT
+                 current_statement.append(c)
+                 current_statement.append(next_char)
+                 i += 1
+             elif c == "/" and next_char == "*":
+                 state = ParserState.MULTILINE_COMMENT
+                 current_statement.append(c)
+                 current_statement.append(next_char)
+                 i += 1
+             else:
+                 most_recent_real_char = prev_real_char
+                 if not c.isspace():
+                     prev_real_char = c
+
+                 is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+                     sql, i, keywords=CONTROL_FLOW_KEYWORDS
+                 )
+                 if is_control_keyword:
+                     # Yield current statement if any
+                     yield from yield_if_complete()
+                     # Yield keyword as its own statement
+                     yield keyword
+                     i += keyword_len
+                     continue
+
+                 (
+                     is_force_new_statement_keyword,
+                     keyword,
+                     keyword_len,
+                 ) = _look_ahead_for_keywords(
+                     sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+                 )
+                 if (
+                     is_force_new_statement_keyword and most_recent_real_char != ")"
+                 ):  # usually we'd have a close paren that closes a CTE
+                     # Force termination of current statement
+                     yield from yield_if_complete()
+
+                     current_statement.append(keyword)
+                     i += keyword_len
+                     continue
+
+                 elif c == ";":
+                     yield from yield_if_complete()
+                 else:
+                     current_statement.append(c)
+
+         elif state == ParserState.STRING:
+             current_statement.append(c)
+             if c == "'" and next_char == "'":
+                 current_statement.append(next_char)
+                 i += 1
+             elif c == "'":
+                 state = ParserState.NORMAL
+
+         elif state == ParserState.COMMENT:
+             current_statement.append(c)
+             if c == "\n":
+                 state = ParserState.NORMAL
+
+         elif state == ParserState.MULTILINE_COMMENT:
+             current_statement.append(c)
+             if c == "*" and next_char == "/":
+                 current_statement.append(next_char)
+                 i += 1
+                 state = ParserState.NORMAL
+
+         i += 1
+
+     # Handle the last statement
+     yield from yield_if_complete()
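
A rough usage sketch (not part of the diff) for the new splitter; the T-SQL batch below is invented for illustration:

from datahub.sql_parsing.split_statements import split_statements

procedure_body = """
CREATE TABLE #tmp (id INT);
INSERT INTO #tmp SELECT id FROM dbo.source_table
UPDATE dbo.target_table SET processed = 1
GO
"""

for statement in split_statements(procedure_body):
    print(repr(statement))

Per the rules above, ";" ends a statement, control-flow keywords such as GO/BEGIN/END are yielded as standalone statements, and a bare INSERT/UPDATE/DELETE/MERGE starts a new statement unless it directly follows a closing paren (the CTE case).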

datahub/sql_parsing/sql_parsing_aggregator.py
@@ -762,7 +762,6 @@ class SqlParsingAggregator(Closeable):

          This assumes that queries come in order of increasing timestamps.
          """
-
          self.report.num_observed_queries += 1

          # All queries with no session ID are assumed to be part of the same session.

datahub/sql_parsing/sql_parsing_common.py
@@ -21,6 +21,9 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
      # See more below:
      # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm
      "teradata",
+     # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
+     # are case preserving but case insensitive.
+     "mssql",
  }
  DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # In some dialects, column identifiers are effectively case insensitive
@@ -28,6 +31,9 @@ DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # automatically lowercase unquoted identifiers.
      "snowflake",
  }
+ assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
+     DIALECTS_WITH_CASE_INSENSITIVE_COLS
+ )


  class QueryType(enum.Enum):

datahub/sql_parsing/sqlglot_lineage.py
@@ -5,7 +5,7 @@ import functools
  import logging
  import traceback
  from collections import defaultdict
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
+ from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union

  import pydantic.dataclasses
  import sqlglot
@@ -873,6 +873,49 @@ def _translate_internal_column_lineage(
      )


+ _StrOrNone = TypeVar("_StrOrNone", str, Optional[str])
+
+
+ def _normalize_db_or_schema(
+     db_or_schema: _StrOrNone,
+     dialect: sqlglot.Dialect,
+ ) -> _StrOrNone:
+     if db_or_schema is None:
+         return None
+
+     # In snowflake, table identifiers must be uppercased to match sqlglot's behavior.
+     if is_dialect_instance(dialect, "snowflake"):
+         return db_or_schema.upper()
+
+     # In mssql, table identifiers must be lowercased.
+     elif is_dialect_instance(dialect, "mssql"):
+         return db_or_schema.lower()
+
+     return db_or_schema
+
+
+ def _simplify_select_into(statement: sqlglot.exp.Expression) -> sqlglot.exp.Expression:
+     """
+     Check if the expression is a SELECT INTO statement. If so, converts it into a CTAS.
+     Other expressions are returned as-is.
+     """
+
+     if not (isinstance(statement, sqlglot.exp.Select) and statement.args.get("into")):
+         return statement
+
+     # Convert from SELECT <cols> INTO <out> <expr>
+     # to CREATE TABLE <out> AS SELECT <cols> <expr>
+     into_expr: sqlglot.exp.Into = statement.args["into"].pop()
+     into_table = into_expr.this
+
+     create = sqlglot.exp.Create(
+         this=into_table,
+         kind="TABLE",
+         expression=statement,
+     )
+     return create
+
+
  def _sqlglot_lineage_inner(
      sql: sqlglot.exp.ExpOrStr,
      schema_resolver: SchemaResolverInterface,
@@ -885,12 +928,9 @@ def _sqlglot_lineage_inner(
      else:
          dialect = get_dialect(default_dialect)

-     if is_dialect_instance(dialect, "snowflake"):
-         # in snowflake, table identifiers must be uppercased to match sqlglot's behavior.
-         if default_db:
-             default_db = default_db.upper()
-         if default_schema:
-             default_schema = default_schema.upper()
+     default_db = _normalize_db_or_schema(default_db, dialect)
+     default_schema = _normalize_db_or_schema(default_schema, dialect)
+
      if is_dialect_instance(dialect, "redshift") and not default_schema:
          # On Redshift, there's no "USE SCHEMA <schema>" command. The default schema
          # is public, and "current schema" is the one at the front of the search path.
@@ -918,6 +958,8 @@ def _sqlglot_lineage_inner(
      #     original_statement.sql(pretty=True, dialect=dialect),
      # )

+     statement = _simplify_select_into(statement)
+
      # Make sure the tables are resolved with the default db / schema.
      # This only works for Unionable statements. For other types of statements,
      # we have to do it manually afterwards, but that's slightly lower accuracy
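
To illustrate what _simplify_select_into does, a rough standalone sketch using sqlglot directly; the query and table names are invented, and the rendered SQL may vary slightly by sqlglot version:

import sqlglot

# Hypothetical T-SQL; sqlglot keeps the SELECT INTO target under args["into"].
stmt = sqlglot.parse_one("SELECT id, total INTO #recent_orders FROM orders", read="tsql")
assert isinstance(stmt, sqlglot.exp.Select) and stmt.args.get("into")

# The same rewrite as _simplify_select_into: detach the INTO clause and wrap
# the remaining SELECT in a CREATE TABLE ... AS ... expression.
into_expr = stmt.args["into"].pop()
ctas = sqlglot.exp.Create(this=into_expr.this, kind="TABLE", expression=stmt)
print(ctas.sql())  # roughly: CREATE TABLE "#recent_orders" AS SELECT id, total FROM orders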

datahub/sql_parsing/sqlglot_utils.py
@@ -61,7 +61,7 @@ def is_dialect_instance(
      else:
          platforms = list(platforms)

-     dialects = [sqlglot.Dialect.get_or_raise(platform) for platform in platforms]
+     dialects = [get_dialect(platform) for platform in platforms]

      if any(isinstance(dialect, dialect_class.__class__) for dialect_class in dialects):
          return True
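
A small, hypothetical usage sketch of the helper this change touches, checking one resolved dialect against several platform names at once (as the temp-table change above now does):

from datahub.sql_parsing.sqlglot_utils import get_dialect, is_dialect_instance

dialect = get_dialect("mssql")
print(is_dialect_instance(dialect, ["redshift", "mssql"]))  # expected: True
print(is_dialect_instance(dialect, "snowflake"))            # expected: False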

datahub/utilities/file_backed_collections.py
@@ -228,6 +228,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
          else:
              self._conn = ConnectionWrapper()

+         if sqlite3.sqlite_version_info < (3, 24, 0):
+             # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
+             # This was added in 3.24.0 from 2018-06-04.
+             # See https://www.sqlite.org/lang_conflict.html
+             raise RuntimeError("SQLite version 3.24.0 or later is required")
+
          # We keep a small cache in memory to avoid having to serialize/deserialize
          # data from the database too often. We use an OrderedDict to build
          # a poor-man's LRU cache.
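
For context on the new version gate: a minimal, self-contained sketch of the sqlite ON CONFLICT upsert syntax that FileBackedDict relies on, available only in SQLite 3.24.0 and later (table and column names are illustrative):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE kv (key TEXT PRIMARY KEY, value TEXT)")
upsert = (
    "INSERT INTO kv (key, value) VALUES (?, ?) "
    "ON CONFLICT (key) DO UPDATE SET value = excluded.value"
)
conn.execute(upsert, ("a", "first"))
conn.execute(upsert, ("a", "second"))  # updates instead of failing on the duplicate key
print(conn.execute("SELECT value FROM kv WHERE key = 'a'").fetchone())  # ('second',)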