acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/testing/doctest.py (new file)
@@ -0,0 +1,12 @@
+ import doctest
+ from types import ModuleType
+
+
+ def assert_doctest(module: ModuleType) -> None:
+     result = doctest.testmod(
+         module,
+         raise_on_error=True,
+         verbose=True,
+     )
+     if result.attempted == 0:
+         raise ValueError(f"No doctests found in {module.__name__}")
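
The new assert_doctest helper wraps doctest.testmod and additionally fails when a module defines no doctests at all. A minimal sketch of calling it against a throwaway module (the example module and its contents are hypothetical; exec-ing the source into the module's __dict__ gives the function the __globals__ that doctest's finder checks before collecting it):

    import textwrap
    import types

    from datahub.testing.doctest import assert_doctest

    # Hypothetical example module containing one passing doctest.
    source = textwrap.dedent(
        '''
        def double(x):
            """
            >>> double(21)
            42
            """
            return x * 2
        '''
    )
    mod = types.ModuleType("example_mod")
    exec(source, mod.__dict__)

    assert_doctest(mod)  # raises if a doctest fails or if none are found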

datahub/utilities/file_backed_collections.py
@@ -1,6 +1,7 @@
  import collections
  import gzip
  import logging
+ import os
  import pathlib
  import pickle
  import shutil
@@ -33,6 +34,14 @@ from datahub.ingestion.api.closeable import Closeable

  logger: logging.Logger = logging.getLogger(__name__)

+ OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
+     os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
+ )
+ OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
+     OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
+     and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
+ )
+
  _DEFAULT_FILE_NAME = "sqlite.db"
  _DEFAULT_TABLE_NAME = "data"
@@ -212,6 +221,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
      _active_object_cache: OrderedDict[str, Tuple[_VT, bool]] = field(
          init=False, repr=False
      )
+     _use_sqlite_on_conflict: bool = field(repr=False, default=True)

      def __post_init__(self) -> None:
          assert (
@@ -232,7 +242,10 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
              # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
              # This was added in 3.24.0 from 2018-06-04.
              # See https://www.sqlite.org/lang_conflict.html
-             raise RuntimeError("SQLite version 3.24.0 or later is required")
+             if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
+                 self.use_sqlite_on_conflict = False
+             else:
+                 raise RuntimeError("SQLite version 3.24.0 or later is required")

          # We keep a small cache in memory to avoid having to serialize/deserialize
          # data from the database too often. We use an OrderedDict to build
@@ -295,7 +308,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                  values.append(column_serializer(value))
              items_to_write.append(tuple(values))

-         if items_to_write:
+         if items_to_write and self._use_sqlite_on_conflict:
              # Tricky: By using a INSERT INTO ... ON CONFLICT (key) structure, we can
              # ensure that the rowid remains the same if a value is updated but is
              # autoincremented when rows are inserted.
@@ -312,6 +325,26 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                  """,
                  items_to_write,
              )
+         else:
+             for item in items_to_write:
+                 try:
+                     self._conn.execute(
+                         f"""INSERT INTO {self.tablename} (
+                             key,
+                             value
+                             {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
+                         )
+                         VALUES ({', '.join(['?'] * (2 + len(self.extra_columns)))})""",
+                         item,
+                     )
+                 except sqlite3.IntegrityError:
+                     self._conn.execute(
+                         f"""UPDATE {self.tablename} SET
+                             value = ?
+                             {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())}
+                         WHERE key = ?""",
+                         (*item[1:], item[0]),
+                     )

      def flush(self) -> None:
          self._prune_cache(len(self._active_object_cache))
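
FileBackedDict's batch flush normally relies on SQLite's INSERT ... ON CONFLICT upsert, available from 3.24.0. With the new OVERRIDE_SQLITE_VERSION_REQ environment variable set on an older SQLite, the hard error is downgraded and writes go through the try-INSERT/except-UPDATE fallback added above. A standalone sketch of that fallback strategy, with an illustrative schema rather than the class's real one:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE data (key TEXT PRIMARY KEY, value TEXT)")

    def upsert(key: str, value: str) -> None:
        # Pre-3.24.0 upsert: attempt the INSERT, and on a primary-key
        # conflict fall back to updating the existing row.
        try:
            conn.execute("INSERT INTO data (key, value) VALUES (?, ?)", (key, value))
        except sqlite3.IntegrityError:
            conn.execute("UPDATE data SET value = ? WHERE key = ?", (value, key))

    upsert("a", "1")
    upsert("a", "2")  # second write hits the conflict path and updates in place
    assert conn.execute("SELECT value FROM data WHERE key = 'a'").fetchone() == ("2",)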

datahub/utilities/partition_executor.py
@@ -268,7 +268,7 @@ class BatchPartitionExecutor(Closeable):
          self.process_batch = process_batch
          self.min_process_interval = min_process_interval
          self.read_from_pending_interval = read_from_pending_interval
-         assert self.max_workers > 1
+         assert self.max_workers >= 1

          self._state_lock = threading.Lock()
          self._executor = ThreadPoolExecutor(
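
The relaxed assertion reflects that a single-worker pool is a legitimate configuration for the executor's ThreadPoolExecutor: batches are simply processed one at a time. For illustration:

    from concurrent.futures import ThreadPoolExecutor

    # max_workers=1 is a valid pool size; submitted work just runs serially.
    with ThreadPoolExecutor(max_workers=1) as pool:
        results = list(pool.map(len, [[1, 2], [3]]))
    assert results == [2, 1]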

datahub/utilities/urn_encoder.py
@@ -4,7 +4,8 @@ from typing import List
  # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage.
  # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts
  # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes.
- RESERVED_CHARS = {",", "(", ")"}
+ # Also see https://datahubproject.io/docs/what/urn/#restrictions
+ RESERVED_CHARS = {",", "(", ")", "␟"}
  RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"})

 
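The new entry is U+241F (SYMBOL FOR UNIT SEPARATOR), one of the characters the linked urn restrictions page disallows. A quick illustration of how each reserved character percent-encodes, using plain urllib rather than the package's UrnEncoder:

    from urllib.parse import quote

    for ch in [",", "(", ")", "␟"]:
        print(repr(ch), "->", quote(ch))
    # ',' -> %2C, '(' -> %28, ')' -> %29, '␟' -> %E2%90%9F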

datahub/utilities/urns/_urn_base.py
@@ -200,7 +200,7 @@ class Urn:
      @classmethod
      @deprecated(reason="no longer needed")
      def validate(cls, urn_str: str) -> None:
-         Urn.create_from_string(urn_str)
+         Urn.from_string(urn_str)

      @staticmethod
      def url_encode(urn: str) -> str:
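
create_from_string gives way to from_string here, keeping the deprecated validate shim as a thin wrapper that parses and discards the result. A usage sketch, importing from the internal module shown in this diff (the urn value is illustrative):

    from datahub.utilities.urns._urn_base import Urn

    # Parsing doubles as validation: a malformed string raises an error
    # (InvalidUrnError in this codebase) instead of returning an Urn.
    urn = Urn.from_string("urn:li:corpuser:datahub")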

datahub/utilities/urns/structured_properties_urn.py
@@ -4,4 +4,4 @@ __all__ = ["StructuredPropertyUrn", "make_structured_property_urn"]


  def make_structured_property_urn(structured_property_id: str) -> str:
-     return str(StructuredPropertyUrn.create_from_string(structured_property_id))
+     return str(StructuredPropertyUrn.from_string(structured_property_id))
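
The same rename lands in the structured-property helper. A sketch of calling it with a full urn, which should round-trip unchanged (the property id is illustrative):

    from datahub.utilities.urns.structured_properties_urn import (
        make_structured_property_urn,
    )

    # Illustrative property id; a full urn parses and stringifies back as-is.
    urn = make_structured_property_urn(
        "urn:li:structuredProperty:io.acryl.privacy.retentionTime"
    )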

datahub/utilities/sql_lineage_parser_impl.py (deleted)
@@ -1,160 +0,0 @@
- import contextlib
- import logging
- import re
- import unittest
- import unittest.mock
- from typing import Dict, List, Optional, Set
-
- from sqllineage.core.holders import Column, SQLLineageHolder
- from sqllineage.exceptions import SQLLineageException
-
- from datahub.utilities.sql_parser_base import SQLParser, SqlParserException
-
- with contextlib.suppress(ImportError):
-     import sqlparse
-     from networkx import DiGraph
-     from sqllineage.core import LineageAnalyzer
-
-     import datahub.utilities.sqllineage_patch
- logger = logging.getLogger(__name__)
-
-
- class SqlLineageSQLParserImpl(SQLParser):
-     _DATE_SWAP_TOKEN = "__d_a_t_e"
-     _HOUR_SWAP_TOKEN = "__h_o_u_r"
-     _TIMESTAMP_SWAP_TOKEN = "__t_i_m_e_s_t_a_m_p"
-     _DATA_SWAP_TOKEN = "__d_a_t_a"
-     _ADMIN_SWAP_TOKEN = "__a_d_m_i_n"
-     _MYVIEW_SQL_TABLE_NAME_TOKEN = "__my_view__.__sql_table_name__"
-     _MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME"
-
-     def __init__(self, sql_query: str, use_raw_names: bool = False) -> None:
-         super().__init__(sql_query)
-         original_sql_query = sql_query
-         self._use_raw_names = use_raw_names
-
-         # SqlLineageParser makes mistakes on lateral flatten queries, use the prefix
-         if "lateral flatten" in sql_query:
-             sql_query = sql_query[: sql_query.find("lateral flatten")]
-
-         # Replace reserved words that break SqlLineageParser
-         self.token_to_original: Dict[str, str] = {
-             self._DATE_SWAP_TOKEN: "date",
-             self._HOUR_SWAP_TOKEN: "hour",
-             self._TIMESTAMP_SWAP_TOKEN: "timestamp",
-             self._DATA_SWAP_TOKEN: "data",
-             self._ADMIN_SWAP_TOKEN: "admin",
-         }
-         for replacement, original in self.token_to_original.items():
-             # Replace original tokens with replacement. Since table and column name can contain a hyphen('-'),
-             # also prevent original tokens appearing as part of these names with a hyphen from getting substituted.
-             sql_query = re.sub(
-                 rf"((?<!-)\b{original}\b)(?!-)",
-                 rf"{replacement}",
-                 sql_query,
-                 flags=re.IGNORECASE,
-             )
-
-         # SqlLineageParser lowercarese tablenames and we need to replace Looker specific token which should be uppercased
-         sql_query = re.sub(
-             rf"(\${{{self._MYVIEW_LOOKER_TOKEN}}})",
-             rf"{self._MYVIEW_SQL_TABLE_NAME_TOKEN}",
-             sql_query,
-         )
-
-         # SqlLineageParser does not handle "encode" directives well. Remove them
-         sql_query = re.sub(r"\sencode [a-zA-Z]*", "", sql_query, flags=re.IGNORECASE)
-
-         # Replace lookml templates with the variable otherwise sqlparse can't parse ${
-         sql_query = re.sub(r"(\${)(.+)(})", r"\2", sql_query)
-         if sql_query != original_sql_query:
-             logger.debug(f"Rewrote original query {original_sql_query} as {sql_query}")
-
-         self._sql = sql_query
-         self._stmt_holders: Optional[List[LineageAnalyzer]] = None
-         self._sql_holder: Optional[SQLLineageHolder] = None
-         try:
-             self._stmt = [
-                 s
-                 for s in sqlparse.parse(
-                     # first apply sqlparser formatting just to get rid of comments, which cause
-                     # inconsistencies in parsing output
-                     sqlparse.format(
-                         self._sql.strip(),
-                         strip_comments=True,
-                         use_space_around_operators=True,
-                     ),
-                 )
-                 if s.token_first(skip_cm=True)
-             ]
-
-             with unittest.mock.patch(
-                 "sqllineage.core.handlers.source.SourceHandler.end_of_query_cleanup",
-                 datahub.utilities.sqllineage_patch.end_of_query_cleanup_patch,
-             ):
-                 with unittest.mock.patch(
-                     "sqllineage.core.holders.SubQueryLineageHolder.add_column_lineage",
-                     datahub.utilities.sqllineage_patch.add_column_lineage_patch,
-                 ):
-                     self._stmt_holders = [
-                         LineageAnalyzer().analyze(stmt) for stmt in self._stmt
-                     ]
-                     self._sql_holder = SQLLineageHolder.of(*self._stmt_holders)
-         except SQLLineageException as e:
-             raise SqlParserException(
-                 f"SQL lineage analyzer error '{e}' for query: '{self._sql}"
-             ) from e
-
-     def get_tables(self) -> List[str]:
-         result: List[str] = []
-         if self._sql_holder is None:
-             logger.error("sql holder not present so cannot get tables")
-             return result
-         for table in self._sql_holder.source_tables:
-             table_normalized = re.sub(
-                 r"^<default>.",
-                 "",
-                 (
-                     str(table)
-                     if not self._use_raw_names
-                     else f"{table.schema.raw_name}.{table.raw_name}"
-                 ),
-             )
-             result.append(str(table_normalized))
-
-         # We need to revert TOKEN replacements
-         for token, replacement in self.token_to_original.items():
-             result = [replacement if c == token else c for c in result]
-         result = [
-             self._MYVIEW_LOOKER_TOKEN if c == self._MYVIEW_SQL_TABLE_NAME_TOKEN else c
-             for c in result
-         ]
-
-         # Sort tables to make the list deterministic
-         result.sort()
-
-         return result
-
-     def get_columns(self) -> List[str]:
-         if self._sql_holder is None:
-             raise SqlParserException("sql holder not present so cannot get columns")
-         graph: DiGraph = self._sql_holder.graph  # For mypy attribute checking
-         column_nodes = [n for n in graph.nodes if isinstance(n, Column)]
-         column_graph = graph.subgraph(column_nodes)
-
-         target_columns = {column for column, deg in column_graph.out_degree if deg == 0}
-
-         result: Set[str] = set()
-         for column in target_columns:
-             # Let's drop all the count(*) and similard columns which are expression actually if it does not have an alias
-             if not any(ele in column.raw_name for ele in ["*", "(", ")"]):
-                 result.add(str(column.raw_name))
-
-         # Reverting back all the previously renamed words which confuses the parser
-         result = {"date" if c == self._DATE_SWAP_TOKEN else c for c in result}
-         result = {
-             "timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result)
-         }
-
-         # swap back renamed date column
-         return list(result)
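
This sqllineage-based parser, with its keyword token-swapping workarounds, is deleted outright; per the files-changed list above, the sqlglot-based code under datahub/sql_parsing continues to evolve in its place. As a contrast with the regex gymnastics above, a rough sketch of pulling source tables off a sqlglot AST (not the package's actual lineage API):

    import sqlglot
    from sqlglot import exp

    query = "SELECT u.id, o.total FROM users u JOIN orders o ON o.user_id = u.id"
    expression = sqlglot.parse_one(query, dialect="snowflake")

    # Table references come straight off the AST; no reserved-word swapping
    # or lowercasing workarounds are needed.
    tables = sorted({table.name for table in expression.find_all(exp.Table)})
    assert tables == ["orders", "users"]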

datahub/utilities/sql_parser.py (deleted)
@@ -1,94 +0,0 @@
- import logging
- import multiprocessing
- import traceback
- from multiprocessing import Process, Queue
- from typing import Any, List, Optional, Tuple
-
- from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl
- from datahub.utilities.sql_parser_base import SQLParser
-
- logger = logging.getLogger(__name__)
-
-
- def sql_lineage_parser_impl_func_wrapper(
-     queue: Optional[multiprocessing.Queue], sql_query: str, use_raw_names: bool = False
- ) -> Optional[Tuple[List[str], List[str], Any]]:
-     """
-     The wrapper function that computes the tables and columns using the SqlLineageSQLParserImpl
-     and puts the results on the shared IPC queue. This is used to isolate SqlLineageSQLParserImpl
-     functionality in a separate process, and hence protect our sources from memory leaks originating in
-     the sqllineage module.
-     :param queue: The shared IPC queue on to which the results will be put.
-     :param sql_query: The SQL query to extract the tables & columns from.
-     :param use_raw_names: Parameter used to ignore sqllineage's default lowercasing.
-     :return: None.
-     """
-     exception_details: Optional[Tuple[BaseException, str]] = None
-     tables: List[str] = []
-     columns: List[str] = []
-     try:
-         parser = SqlLineageSQLParserImpl(sql_query, use_raw_names)
-         tables = parser.get_tables()
-         columns = parser.get_columns()
-     except BaseException as e:
-         exc_msg = traceback.format_exc()
-         exception_details = (e, exc_msg)
-         logger.debug(exc_msg)
-
-     if queue is not None:
-         queue.put((tables, columns, exception_details))
-         return None
-     else:
-         return (tables, columns, exception_details)
-
-
- class SqlLineageSQLParser(SQLParser):
-     def __init__(
-         self,
-         sql_query: str,
-         use_external_process: bool = False,
-         use_raw_names: bool = False,
-     ) -> None:
-         super().__init__(sql_query, use_external_process)
-         if use_external_process:
-             self.tables, self.columns = self._get_tables_columns_process_wrapped(
-                 sql_query, use_raw_names
-             )
-         else:
-             return_tuple = sql_lineage_parser_impl_func_wrapper(
-                 None, sql_query, use_raw_names
-             )
-             if return_tuple is not None:
-                 (
-                     self.tables,
-                     self.columns,
-                     some_exception,
-                 ) = return_tuple
-
-     @staticmethod
-     def _get_tables_columns_process_wrapped(
-         sql_query: str, use_raw_names: bool = False
-     ) -> Tuple[List[str], List[str]]:
-         # Invoke sql_lineage_parser_impl_func_wrapper in a separate process to avoid
-         # memory leaks from sqllineage module used by SqlLineageSQLParserImpl. This will help
-         # shield our sources like lookml & redash, that need to parse a large number of SQL statements,
-         # from causing significant memory leaks in the datahub cli during ingestion.
-         queue: multiprocessing.Queue = Queue()
-         process: multiprocessing.Process = Process(
-             target=sql_lineage_parser_impl_func_wrapper,
-             args=(queue, sql_query, use_raw_names),
-         )
-         process.start()
-         tables, columns, exception_details = queue.get(block=True)
-         if exception_details is not None:
-             raise exception_details[0](f"Sub-process exception: {exception_details[1]}")
-         return tables, columns
-
-     def get_tables(self) -> List[str]:
-         return self.tables
-
-     def get_columns(self) -> List[str]:
-         return self.columns
-
-
- DefaultSQLParser = SqlLineageSQLParser
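
This subprocess wrapper existed solely to quarantine sqllineage's memory leaks, so it goes away with the parser. For reference, the isolation pattern it implemented, reduced to its essence (purely illustrative, not a datahub API):

    import multiprocessing


    def _leaky_work(queue: multiprocessing.Queue, payload: str) -> None:
        # Whatever memory this leaks is reclaimed when the child process exits.
        queue.put(payload.upper())


    if __name__ == "__main__":
        queue: multiprocessing.Queue = multiprocessing.Queue()
        child = multiprocessing.Process(target=_leaky_work, args=(queue, "hello"))
        child.start()
        result = queue.get(block=True)  # block until the child reports back
        child.join()
        assert result == "HELLO"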

datahub/utilities/sql_parser_base.py (deleted)
@@ -1,21 +0,0 @@
- from abc import ABCMeta, abstractmethod
- from typing import List
-
-
- class SqlParserException(Exception):
-     """Raised when sql parser fails"""
-
-     pass
-
-
- class SQLParser(metaclass=ABCMeta):
-     def __init__(self, sql_query: str, use_external_process: bool = True) -> None:
-         self._sql_query = sql_query
-
-     @abstractmethod
-     def get_tables(self) -> List[str]:
-         pass
-
-     @abstractmethod
-     def get_columns(self) -> List[str]:
-         pass