acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (34)
  1. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +34 -30
  3. datahub/__init__.py +1 -1
  4. datahub/configuration/kafka_consumer_config.py +4 -1
  5. datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
  6. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
  8. datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
  9. datahub/ingestion/source/ge_data_profiler.py +1 -1
  10. datahub/ingestion/source/ge_profiling_config.py +6 -2
  11. datahub/ingestion/source/redshift/report.py +2 -2
  12. datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
  13. datahub/ingestion/source/sql/mssql/job_models.py +1 -0
  14. datahub/ingestion/source/sql/mssql/source.py +113 -38
  15. datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
  16. datahub/ingestion/source/sql/oracle.py +50 -0
  17. datahub/ingestion/source/sql/sql_common.py +28 -54
  18. datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
  19. datahub/ingestion/source/sql/sql_report.py +75 -0
  20. datahub/ingestion/source/sql/teradata.py +2 -2
  21. datahub/ingestion/source/sql/vertica.py +2 -2
  22. datahub/ingestion/source/unity/report.py +2 -2
  23. datahub/metadata/schema.avsc +1 -1
  24. datahub/metadata/schemas/AssertionInfo.avsc +1 -1
  25. datahub/metadata/schemas/InputFields.avsc +1 -1
  26. datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
  27. datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
  28. datahub/sql_parsing/datajob.py +50 -0
  29. datahub/sql_parsing/query_types.py +10 -1
  30. datahub/sql_parsing/split_statements.py +163 -0
  31. datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
  32. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
  33. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
@@ -44,7 +44,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
@@ -330,7 +330,7 @@ def optimized_get_view_definition(
 
 
 @dataclass
-class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport):
+class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
     num_queries_parsed: int = 0
     num_view_ddl_parsed: int = 0
     num_table_parse_failures: int = 0
@@ -27,7 +27,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SQLSourceReport,
     SqlWorkUnit,
     get_schema_metadata,
 )
@@ -35,6 +34,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import get_domain_wu
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
@@ -536,7 +536,7 @@ class VerticaSource(SQLAlchemySource):
            )

            if not self.is_dataset_eligible_for_profiling(
-                dataset_name, sql_config, inspector, profile_candidates
+                dataset_name, schema, inspector, profile_candidates
            ):
                if self.config.profiling.report_dropped_profiles:
                    self.report.report_dropped(f"profile of {dataset_name}")
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from typing import Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
@@ -19,7 +19,7 @@ class UnityCatalogUsagePerfReport(Report):
 
 
 @dataclass
-class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport):
+class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     metastores: EntityFilterReport = EntityFilterReport.field(type="metastore")
     catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog")
     schemas: EntityFilterReport = EntityFilterReport.field(type="schema")
@@ -6005,7 +6005,7 @@
       "fields": [
         {
           "Searchable": {
-            "boostScore": 5.0,
+            "boostScore": 1.0,
             "fieldName": "fieldPaths",
             "fieldType": "TEXT",
             "queryByDefault": "true"
@@ -1542,7 +1542,7 @@
       "fields": [
         {
           "Searchable": {
-            "boostScore": 5.0,
+            "boostScore": 1.0,
             "fieldName": "fieldPaths",
             "fieldType": "TEXT",
             "queryByDefault": "true"
@@ -42,7 +42,7 @@
       "fields": [
         {
           "Searchable": {
-            "boostScore": 5.0,
+            "boostScore": 1.0,
             "fieldName": "fieldPaths",
             "fieldType": "TEXT",
             "queryByDefault": "true"
@@ -4013,7 +4013,7 @@
       "fields": [
         {
           "Searchable": {
-            "boostScore": 5.0,
+            "boostScore": 1.0,
             "fieldName": "fieldPaths",
             "fieldType": "TEXT",
             "queryByDefault": "true"
@@ -309,7 +309,7 @@
       "fields": [
         {
           "Searchable": {
-            "boostScore": 5.0,
+            "boostScore": 1.0,
             "fieldName": "fieldPaths",
             "fieldType": "TEXT",
             "queryByDefault": "true"
@@ -0,0 +1,50 @@
+import logging
+from typing import Iterable, List, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.schema_classes import (
+    DataJobInputOutputClass,
+    FineGrainedLineageClass,
+    UpstreamLineageClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_datajob_input_output(
+    *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+) -> Optional[DataJobInputOutputClass]:
+    inputDatasets: List[str] = []
+    outputDatasets: List[str] = []
+    fineGrainedLineages: List[FineGrainedLineageClass] = []
+    for mcp in mcps:
+        # TODO: Represent simple write operations without lineage as outputDatasets.
+
+        upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+        if upstream_lineage is not None:
+            if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+                outputDatasets.append(mcp.entityUrn)
+
+            for upstream in upstream_lineage.upstreams:
+                if upstream.dataset not in inputDatasets:
+                    inputDatasets.append(upstream.dataset)
+
+            if upstream_lineage.fineGrainedLineages:
+                for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+                    fineGrainedLineages.append(fineGrainedLineage)
+
+        elif ignore_extra_mcps:
+            pass
+        else:
+            raise ValueError(
+                f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+            )
+
+    if not inputDatasets and not outputDatasets:
+        return None
+
+    return DataJobInputOutputClass(
+        inputDatasets=inputDatasets,
+        outputDatasets=outputDatasets,
+        fineGrainedLineages=fineGrainedLineages,
+    )
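The new datahub/sql_parsing/datajob.py helper above folds a stream of lineage MCPs into a single DataJobInputOutputClass, for example to attach stored-procedure lineage to a DataJob. A minimal usage sketch follows; the platform and table names are invented for illustration, and only the function shown above plus standard DataHub emitter/schema classes are assumed.

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)
from datahub.sql_parsing.datajob import to_datajob_input_output

# Hypothetical URNs purely for illustration.
upstream_urn = make_dataset_urn(platform="mssql", name="mydb.dbo.orders")
downstream_urn = make_dataset_urn(platform="mssql", name="mydb.dbo.orders_summary")

# One lineage MCP of the kind a SQL parser might emit while processing a procedure.
lineage_mcp = MetadataChangeProposalWrapper(
    entityUrn=downstream_urn,
    aspect=UpstreamLineageClass(
        upstreams=[
            UpstreamClass(
                dataset=upstream_urn, type=DatasetLineageTypeClass.TRANSFORMED
            )
        ]
    ),
)

# Collapse the lineage MCPs into a single DataJobInputOutput aspect.
io_aspect = to_datajob_input_output(mcps=[lineage_mcp])
assert io_aspect is not None
print(io_aspect.inputDatasets)   # [<urn of mydb.dbo.orders>]
print(io_aspect.outputDatasets)  # [<urn of mydb.dbo.orders_summary>]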
@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
     identifier: sqlglot.exp.Identifier = table.this
 
     return identifier.args.get("temporary") or (
-        is_dialect_instance(dialect, "redshift") and identifier.name.startswith("#")
+        # These dialects use # as a prefix for temp tables.
+        is_dialect_instance(
+            dialect,
+            [
+                "redshift",
+                "mssql",
+                # sybase is another one, but we don't support that dialect yet.
+            ],
+        )
+        and identifier.name.startswith("#")
     )
 
 
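The practical effect of this change is that MSSQL local temp tables (names starting with #) are now treated as temporary during parsing, not just Redshift ones. A rough standalone sketch with sqlglot, assuming (as the check above implies) that the tsql dialect keeps the leading # in the parsed identifier name:

import sqlglot

# Parse an MSSQL statement that reads from a local temp table.
expression = sqlglot.parse_one("SELECT id FROM #staging_orders", read="tsql")
table = expression.find(sqlglot.exp.Table)
assert table is not None

# The identifier name starts with "#", so the updated check would classify it
# as a temp table under the mssql dialect (previously only redshift).
print(table.name)                  # expected: "#staging_orders"
print(table.name.startswith("#"))  # expected: True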
@@ -0,0 +1,163 @@
+import re
+from enum import Enum
+from typing import Generator, List, Tuple
+
+CONTROL_FLOW_KEYWORDS = [
+    "GO",
+    r"BEGIN\w+TRY",
+    r"BEGIN\w+CATCH",
+    "BEGIN",
+    r"END\w+TRY",
+    r"END\w+CATCH",
+    "END",
+]
+
+# There's an exception to this rule, which is when the statement
+# is preceeded by a CTE.
+FORCE_NEW_STATEMENT_KEYWORDS = [
+    # SELECT is used inside queries as well, so we can't include it here.
+    "INSERT",
+    "UPDATE",
+    "DELETE",
+    "MERGE",
+]
+
+
+class ParserState(Enum):
+    NORMAL = 1
+    STRING = 2
+    COMMENT = 3
+    MULTILINE_COMMENT = 4
+
+
+def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+    """
+    Check if a keyword exists at the given position using regex word boundaries.
+    """
+    if pos + len(keyword) > len(sql):
+        return False
+
+    # If we're not at a word boundary, we can't generate a keyword.
+    if pos > 0 and not (
+        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+    ):
+        return False
+
+    pattern = rf"^{re.escape(keyword)}\b"
+    match = re.match(pattern, sql[pos:], re.IGNORECASE)
+    return bool(match)
+
+
+def _look_ahead_for_keywords(
+    sql: str, pos: int, keywords: List[str]
+) -> Tuple[bool, str, int]:
+    """
+    Look ahead for SQL keywords at the current position.
+    """
+
+    for keyword in keywords:
+        if _is_keyword_at_position(sql, pos, keyword):
+            return True, keyword, len(keyword)
+    return False, "", 0
+
+
+def split_statements(sql: str) -> Generator[str, None, None]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+    if not sql or not sql.strip():
+        return
+
+    current_statement: List[str] = []
+    state = ParserState.NORMAL
+    i = 0
+
+    def yield_if_complete() -> Generator[str, None, None]:
+        statement = "".join(current_statement).strip()
+        if statement:
+            yield statement
+        current_statement.clear()
+
+    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+    while i < len(sql):
+        c = sql[i]
+        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+        if state == ParserState.NORMAL:
+            if c == "'":
+                state = ParserState.STRING
+                current_statement.append(c)
+                prev_real_char = c
+            elif c == "-" and next_char == "-":
+                state = ParserState.COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            elif c == "/" and next_char == "*":
+                state = ParserState.MULTILINE_COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            else:
+                most_recent_real_char = prev_real_char
+                if not c.isspace():
+                    prev_real_char = c
+
+                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
+                )
+                if is_control_keyword:
+                    # Yield current statement if any
+                    yield from yield_if_complete()
+                    # Yield keyword as its own statement
+                    yield keyword
+                    i += keyword_len
+                    continue
+
+                (
+                    is_force_new_statement_keyword,
+                    keyword,
+                    keyword_len,
+                ) = _look_ahead_for_keywords(
+                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+                )
+                if (
+                    is_force_new_statement_keyword and most_recent_real_char != ")"
+                ):  # usually we'd have a close paren that closes a CTE
+                    # Force termination of current statement
+                    yield from yield_if_complete()
+
+                    current_statement.append(keyword)
+                    i += keyword_len
+                    continue
+
+                elif c == ";":
+                    yield from yield_if_complete()
+                else:
+                    current_statement.append(c)
+
+        elif state == ParserState.STRING:
+            current_statement.append(c)
+            if c == "'" and next_char == "'":
+                current_statement.append(next_char)
+                i += 1
+            elif c == "'":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.COMMENT:
+            current_statement.append(c)
+            if c == "\n":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.MULTILINE_COMMENT:
+            current_statement.append(c)
+            if c == "*" and next_char == "/":
+                current_statement.append(next_char)
+                i += 1
+                state = ParserState.NORMAL
+
+        i += 1
+
+    # Handle the last statement
+    yield from yield_if_complete()
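A short usage sketch for the new splitter, which presumably feeds the new MSSQL stored-procedure lineage module. The procedure body below is invented, but the expected behavior follows from the code above: statements are split on semicolons, on GO/BEGIN/END control-flow markers, and at forced boundaries before INSERT/UPDATE/DELETE/MERGE keywords.

from datahub.sql_parsing.split_statements import split_statements

# Hypothetical T-SQL body with no semicolon between the INSERT and UPDATE.
procedure_body = """
CREATE TABLE #tmp (id INT);
INSERT INTO #tmp SELECT id FROM dbo.source_table
UPDATE dbo.target_table SET processed = 1 WHERE id IN (SELECT id FROM #tmp)
"""

for statement in split_statements(procedure_body):
    print(repr(statement))

# Expected output: three statements, split even without trailing semicolons.
#   'CREATE TABLE #tmp (id INT)'
#   'INSERT INTO #tmp SELECT id FROM dbo.source_table'
#   'UPDATE dbo.target_table SET processed = 1 WHERE id IN (SELECT id FROM #tmp)'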
@@ -762,7 +762,6 @@ class SqlParsingAggregator(Closeable):
 
         This assumes that queries come in order of increasing timestamps.
         """
-
        self.report.num_observed_queries += 1

        # All queries with no session ID are assumed to be part of the same session.