acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/sql_parsing/split_statements.py
CHANGED

@@ -1,26 +1,48 @@
 import re
 from enum import Enum
-from typing import …
+from typing import Iterator, List, Tuple
+
+SELECT_KEYWORD = "SELECT"
+CASE_KEYWORD = "CASE"
+END_KEYWORD = "END"
 
 CONTROL_FLOW_KEYWORDS = [
     "GO",
-    r"BEGIN\…
-    r"BEGIN\…
+    r"BEGIN\s+TRY",
+    r"BEGIN\s+CATCH",
     "BEGIN",
-    r"END\…
-    r"END\…
-
+    r"END\s+TRY",
+    r"END\s+CATCH",
+    # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
+    # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
+    "IF",
+    # For things like CASE, END does not mean the end of a statement.
+    # We have special handling for this.
+    END_KEYWORD,
+    # "ELSE",  # else is also valid in CASE, so we can't use it here.
 ]
 
 # There's an exception to this rule, which is when the statement
-# is …
-
+# is preceded by a CTE. For those, we have to check if the character
+# before this is a ")".
+NEW_STATEMENT_KEYWORDS = [
     # SELECT is used inside queries as well, so we can't include it here.
+    "CREATE",
     "INSERT",
     "UPDATE",
     "DELETE",
     "MERGE",
 ]
+STRICT_NEW_STATEMENT_KEYWORDS = [
+    # For these keywords, a SELECT following it does indicate a new statement.
+    "DROP",
+    "TRUNCATE",
+]
+
+
+class _AlreadyIncremented(Exception):
+    # Using exceptions for control flow isn't great - but the code is clearer so it's fine.
+    pass
 
 
 class ParserState(Enum):

@@ -30,134 +52,206 @@ class ParserState(Enum):
     MULTILINE_COMMENT = 4
 
 
-    """
-    if pos + len(keyword) > len(sql):
-        return False
+class _StatementSplitter:
+    def __init__(self, sql: str):
+        self.sql = sql
 
-    ):
-        return False
+        # Main parser state.
+        self.i = 0
+        self.state = ParserState.NORMAL
+        self.current_statement: List[str] = []
 
-    match = re.match(pattern, sql[pos:], re.IGNORECASE)
-    return bool(match)
+        # Additional parser state.
 
+        # If we see a SELECT, should we start a new statement?
+        # If we previously saw a drop/truncate/etc, a SELECT does mean a new statement.
+        # But if we're in a select/create/etc, a select could just be a subquery.
+        self.does_select_mean_new_statement = False
 
-    Look ahead for SQL keywords at the current position.
-    """
+        # The END keyword terminates CASE and BEGIN blocks.
+        # We need to match the CASE statements with END blocks to determine
+        # what a given END is closing.
+        self.current_case_statements = 0
 
+    def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
+        """
+        Check if a keyword exists at the given position using regex word boundaries.
+        """
+        sql = self.sql
 
+        keyword_length = len(keyword.replace(r"\s+", " "))
 
+        if pos + keyword_length > len(sql):
+            return False, ""
+
+        # If we're not at a word boundary, we can't generate a keyword.
+        if pos > 0 and not (
+            bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+            or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+        ):
+            return False, ""
+
+        pattern = rf"^{keyword}\b"
+        match = re.match(pattern, sql[pos:], re.IGNORECASE)
+        is_match = bool(match)
+        actual_match = (
+            sql[pos:][match.start() : match.end()] if match is not None else ""
+        )
+        return is_match, actual_match
+
+    def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
+        """
+        Look ahead for SQL keywords at the current position.
+        """
+        for keyword in keywords:
+            is_match, keyword = self._is_keyword_at_position(self.i, keyword)
+            if is_match:
+                return True, keyword, len(keyword)
+        return False, "", 0
+
+    def _yield_if_complete(self) -> Iterator[str]:
+        statement = "".join(self.current_statement).strip()
         if statement:
+            # Subtle - to avoid losing full whitespace, they get merged into the next statement.
             yield statement
-            current_statement.clear()
-
-    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
-    while i < len(sql):
-        c = sql[i]
-        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
-
-        if state == ParserState.NORMAL:
-            if c == "'":
-                state = ParserState.STRING
-                current_statement.append(c)
-                prev_real_char = c
-            elif c == "-" and next_char == "-":
-                state = ParserState.COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            elif c == "/" and next_char == "*":
-                state = ParserState.MULTILINE_COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            else:
-                most_recent_real_char = prev_real_char
-                if not c.isspace():
-                    prev_real_char = c
+            self.current_statement.clear()
 
+        # Reset current_statement-specific state.
+        self.does_select_mean_new_statement = False
+        if self.current_case_statements != 0:
+            breakpoint()
+        self.current_case_statements = 0
+
+    def process(self) -> Iterator[str]:
+        if not self.sql or not self.sql.strip():
+            yield from ()
+
+        prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+        while self.i < len(self.sql):
+            c = self.sql[self.i]
+            next_char = self.sql[self.i + 1] if self.i < len(self.sql) - 1 else "\0"
+
+            if self.state == ParserState.NORMAL:
+                if c == "'":
+                    self.state = ParserState.STRING
+                    self.current_statement.append(c)
+                    prev_real_char = c
+                elif c == "-" and next_char == "-":
+                    self.state = ParserState.COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "/" and next_char == "*":
+                    self.state = ParserState.MULTILINE_COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
                 else:
+                    most_recent_real_char = prev_real_char
+                    if not c.isspace():
+                        prev_real_char = c
+
+                    try:
+                        yield from self._process_normal(
+                            most_recent_real_char=most_recent_real_char
+                        )
+                    except _AlreadyIncremented:
+                        # Skip the normal i += 1 step.
+                        continue
+
+            elif self.state == ParserState.STRING:
+                self.current_statement.append(c)
+                if c == "'" and next_char == "'":
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "'":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.COMMENT:
+                self.current_statement.append(c)
+                if c == "\n":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.MULTILINE_COMMENT:
+                self.current_statement.append(c)
+                if c == "*" and next_char == "/":
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                    self.state = ParserState.NORMAL
+
+            self.i += 1
+
+        # Handle the last statement
+        yield from self._yield_if_complete()
+
+    def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
+        c = self.sql[self.i]
+
+        if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
+            self.current_case_statements += 1
+
+        is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
+            keywords=CONTROL_FLOW_KEYWORDS
+        )
+        if (
+            is_control_keyword
+            and keyword == END_KEYWORD
+            and self.current_case_statements > 0
+        ):
+            # If we're closing a CASE statement with END, we can just decrement the counter and continue.
+            self.current_case_statements -= 1
+        elif is_control_keyword:
+            # Yield current statement if any
+            yield from self._yield_if_complete()
+            # Yield keyword as its own statement
+            yield keyword
+            self.i += keyword_len
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_strict_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(keywords=STRICT_NEW_STATEMENT_KEYWORDS)
+        if is_strict_new_statement_keyword:
+            yield from self._yield_if_complete()
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_force_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(
+            keywords=(
+                NEW_STATEMENT_KEYWORDS
+                + ([SELECT_KEYWORD] if self.does_select_mean_new_statement else [])
+            ),
+        )
+        if (
+            is_force_new_statement_keyword and most_recent_real_char != ")"
+        ):  # usually we'd have a close paren that closes a CTE
+            # Force termination of current statement
+            yield from self._yield_if_complete()
+
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            raise _AlreadyIncremented()
+
+        if c == ";":
+            yield from self._yield_if_complete()
+        else:
+            self.current_statement.append(c)
+
+
+def split_statements(sql: str) -> Iterator[str]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+    splitter = _StatementSplitter(sql)
+    yield from splitter.process()
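Note: the new splitter's behavior is easiest to see end to end. A minimal usage sketch (the T-SQL input is illustrative, not from the package):

from datahub.sql_parsing.split_statements import split_statements

# Hypothetical T-SQL script: "GO" is a control-flow keyword and is yielded as
# its own statement, and the SELECT after DROP starts a new statement because
# DROP is in STRICT_NEW_STATEMENT_KEYWORDS. The SELECT inside the INSERT is
# treated as a subquery and does not split.
sql = """
CREATE TABLE #temp (id INT)
GO
INSERT INTO #temp SELECT id FROM source_table
DROP TABLE #temp
SELECT 1
"""

for statement in split_statements(sql):
    print(repr(statement))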
datahub/sql_parsing/sql_parsing_common.py
CHANGED

@@ -24,12 +24,19 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
     # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
     # are case preserving but case insensitive.
     "mssql",
+    # Oracle automatically converts unquoted identifiers to uppercase.
+    # https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Database-Object-Names-and-Qualifiers.html#GUID-3C59E44A-5140-4BCA-B9E1-3039C8050C49
+    # In our Oracle connector, we then normalize column names to lowercase. This behavior
+    # actually comes from the underlying Oracle sqlalchemy dialect.
+    # https://github.com/sqlalchemy/sqlalchemy/blob/d9b4d8ff3aae504402d324f3ebf0b8faff78f5dc/lib/sqlalchemy/dialects/oracle/base.py#L2579
+    "oracle",
 }
 DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
     # In some dialects, column identifiers are effectively case insensitive
     # because they are automatically converted to uppercase. Most other systems
     # automatically lowercase unquoted identifiers.
     "snowflake",
+    "oracle",
 }
 assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
     DIALECTS_WITH_CASE_INSENSITIVE_COLS
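Note: an illustrative sketch (not a datahub helper) of what membership in these two sets implies when normalizing parsed column names; the sets below are abridged copies of the ones above:

DIALECTS_WITH_CASE_INSENSITIVE_COLS = {"mssql", "oracle", "snowflake"}
DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {"snowflake", "oracle"}


def normalize_identifier(name: str, dialect: str) -> str:
    if dialect in DIALECTS_WITH_DEFAULT_UPPERCASE_COLS:
        return name.upper()  # unquoted identifiers fold to uppercase
    if dialect in DIALECTS_WITH_CASE_INSENSITIVE_COLS:
        return name.lower()  # case-insensitive: pick a canonical casing
    return name  # case-sensitive dialects keep identifiers as written


assert normalize_identifier("Employee_Id", "oracle") == "EMPLOYEE_ID"
assert normalize_identifier("Employee_Id", "mssql") == "employee_id"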
datahub/sql_parsing/sqlglot_utils.py
CHANGED

@@ -56,10 +56,7 @@ def get_dialect(platform: DialectOrStr) -> sqlglot.Dialect:
 def is_dialect_instance(
     dialect: sqlglot.Dialect, platforms: Union[str, Iterable[str]]
 ) -> bool:
-    if isinstance(platforms, str):
-        platforms = [platforms]
-    else:
-        platforms = list(platforms)
+    platforms = [platforms] if isinstance(platforms, str) else list(platforms)
 
     dialects = [get_dialect(platform) for platform in platforms]
 
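Note: the refactor is behavior-preserving; a string or any iterable of platform names now takes the same code path. A usage sketch based on the signature above:

from datahub.sql_parsing.sqlglot_utils import get_dialect, is_dialect_instance

dialect = get_dialect("redshift")
# A bare string and an iterable of platform names behave identically.
assert is_dialect_instance(dialect, "redshift")
assert is_dialect_instance(dialect, ["postgres", "redshift"])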
datahub/testing/check_sql_parser_result.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-import os
 import pathlib
 from typing import Any, Dict, Optional
 
@@ -8,11 +7,10 @@ import deepdiff
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
-UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true"
-
 
 def assert_sql_result_with_resolver(
     sql: str,
@@ -22,6 +20,8 @@ def assert_sql_result_with_resolver(
     allow_table_error: bool = False,
     **kwargs: Any,
 ) -> None:
+    settings = get_golden_settings()
+
     # HACK: Our BigQuery source overwrites this value and doesn't undo it.
     # As such, we need to handle that here.
     BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd"
@@ -47,15 +47,14 @@ def assert_sql_result_with_resolver(
     )
 
     txt = res.json(indent=4)
-    if UPDATE_FILES:
+    if settings.update_golden:
         expected_file.write_text(txt)
         return
 
     if not expected_file.exists():
         expected_file.write_text(txt)
         raise AssertionError(
-            f"…
-            "Created it with the expected output. Please verify it."
+            f"Missing expected golden file; run with --update-golden-files to create it: {expected_file}"
         )
 
     expected = SqlParsingResult.parse_raw(expected_file.read_text())
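Note: the UPDATE_SQLPARSER_FILES environment variable is replaced by the shared --update-golden-files pytest flag. An equivalent invocation from Python (the test path is hypothetical):

import pytest

# Regenerates the SQL parser golden files; previously this required
# UPDATE_SQLPARSER_FILES=true in the environment.
pytest.main(["tests/unit/sql_parsing", "--update-golden-files"])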
datahub/testing/compare_metadata_json.py
CHANGED

@@ -16,6 +16,7 @@ from deepdiff import DeepDiff
 from datahub.ingestion.sink.file import write_metadata_file
 from datahub.ingestion.source.file import read_metadata_file
 from datahub.testing.mcp_diff import CannotCompareMCPs, MCPDiff, get_aspects_by_urn
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
@@ -40,26 +41,26 @@ def load_json_file(filename: Union[str, os.PathLike]) -> MetadataJson:
 def assert_metadata_files_equal(
     output_path: Union[str, os.PathLike],
     golden_path: Union[str, os.PathLike],
-    update_golden: bool,
-    copy_output: bool,
     ignore_paths: Sequence[str] = (),
     ignore_paths_v2: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> None:
+    settings = get_golden_settings()
+
     golden_exists = os.path.isfile(golden_path)
 
-    if copy_output:
+    if settings.copy_output:
         shutil.copyfile(str(output_path), str(golden_path) + ".output")
         logger.info(f"Copied output file to {golden_path}.output")
 
-    if not update_golden and not golden_exists:
+    if not settings.update_golden and not golden_exists:
         raise FileNotFoundError(
             "Golden file does not exist. Please run with the --update-golden-files option to create."
         )
 
     output = load_json_file(output_path)
 
-    if update_golden and not golden_exists:
+    if settings.update_golden and not golden_exists:
         shutil.copyfile(str(output_path), str(golden_path))
         return
     else:
@@ -87,7 +88,7 @@ def assert_metadata_files_equal(
     ignore_paths = (*ignore_paths, *default_exclude_paths)
 
     diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
-    if diff and update_golden:
+    if diff and settings.update_golden:
         if isinstance(diff, MCPDiff) and diff.is_delta_valid:
             logger.info(f"Applying delta to golden file {golden_path}")
             diff.apply_delta(golden)
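Note: callers no longer pass update_golden/copy_output explicitly; both now come from the pytest flags via get_golden_settings(). A sketch of the new call shape (paths are illustrative):

from datahub.testing.compare_metadata_json import assert_metadata_files_equal

assert_metadata_files_equal(
    output_path="output_mces.json",
    golden_path="golden_mces.json",
)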
datahub/testing/pytest_hooks.py
ADDED

@@ -0,0 +1,56 @@
+import dataclasses
+from typing import Optional
+
+import pytest
+
+__all__ = [
+    "load_golden_flags",
+    "get_golden_settings",
+    "pytest_addoption",
+    "GoldenFileSettings",
+]
+
+
+@dataclasses.dataclass
+class GoldenFileSettings:
+    update_golden: bool
+    copy_output: bool
+
+
+_registered: bool = False
+_settings: Optional[GoldenFileSettings] = None
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--update-golden-files",
+        action="store_true",
+        default=False,
+    )
+
+    # TODO: Deprecate and remove this flag.
+    parser.addoption("--copy-output-files", action="store_true", default=False)
+
+    global _registered
+    _registered = True
+
+
+@pytest.fixture(scope="session", autouse=True)
+def load_golden_flags(pytestconfig: pytest.Config) -> None:
+    global _settings
+    _settings = GoldenFileSettings(
+        update_golden=pytestconfig.getoption("--update-golden-files"),
+        copy_output=pytestconfig.getoption("--copy-output-files"),
+    )
+
+
+def get_golden_settings() -> GoldenFileSettings:
+    if not _registered:
+        raise ValueError(
+            "Golden files aren't set up properly. Call register_golden_flags from a conftest pytest_addoptions method."
+        )
+    if not _settings:
+        raise ValueError(
+            "Golden files aren't set up properly. Ensure load_golden_flags is imported in your conftest."
+        )
+    return _settings
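Note: based on the error messages in get_golden_settings, test suites opt in by re-exporting the option hook and the autouse fixture from their conftest.py. A minimal sketch:

# conftest.py
from datahub.testing.pytest_hooks import (  # noqa: F401
    load_golden_flags,
    pytest_addoption,
)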
datahub/upgrade/upgrade.py
CHANGED

@@ -293,9 +293,9 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> i
     return server.version.micro - client.version.micro
 
 
-def _maybe_print_upgrade_message(
+def _maybe_print_upgrade_message(
     version_stats: Optional[DataHubVersionStats],
-) -> None:
+) -> None:
     days_before_cli_stale = 7
     days_before_quickstart_stale = 7
 
datahub/utilities/file_backed_collections.py
CHANGED

@@ -10,13 +10,11 @@ import tempfile
 import threading
 from dataclasses import dataclass, field
 from datetime import datetime
-from enum import Enum
 from types import TracebackType
 from typing import (
     Any,
     Callable,
     Dict,
-    Final,
     Generic,
     Iterator,
     List,
@@ -31,6 +29,7 @@ from typing import (
 )
 
 from datahub.ingestion.api.closeable import Closeable
+from datahub.utilities.sentinels import Unset, unset
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -59,16 +58,6 @@ SqliteValue = Union[int, float, str, bytes, datetime, None]
 _VT = TypeVar("_VT")
 
 
-class Unset(Enum):
-    token = 0
-
-
-# It's pretty annoying to create a true sentinel that works with typing.
-# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
-# Can't wait for https://peps.python.org/pep-0661/
-_unset: Final = Unset.token
-
-
 class ConnectionWrapper:
     """
     Wraps a SQlite connection, allowing connection reuse across multiple FileBacked* objects.
@@ -372,7 +361,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         self,
         /,
         key: str,
-        default: Union[_VT, Unset] = _unset,
+        default: Union[_VT, Unset] = unset,
     ) -> _VT:
         # If key is in the dictionary, this is similar to __getitem__ + mark_dirty.
         # If key is not in the dictionary, this is similar to __setitem__.
@@ -383,7 +372,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self.mark_dirty(key)
             return value
         except KeyError:
-            if default is _unset:
+            if default is unset:
                 raise
 
             self[key] = default
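Note: the Unset sentinel removed here moved to the new datahub/utilities/sentinels.py (+22 lines in the file list above). Its exact contents aren't shown in this diff; a minimal sketch consistent with the `from datahub.utilities.sentinels import Unset, unset` import and the removed code:

from enum import Enum
from typing import Final


class Unset(Enum):
    token = 0


# It's pretty annoying to create a true sentinel that works with typing,
# hence the single-member Enum trick (see PEP 484 / PEP 661).
unset: Final = Unset.token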