acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic; details are available on the package's registry page.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/split_statements.py
@@ -1,26 +1,48 @@
 import re
 from enum import Enum
-from typing import Generator, List, Tuple
+from typing import Iterator, List, Tuple
+
+SELECT_KEYWORD = "SELECT"
+CASE_KEYWORD = "CASE"
+END_KEYWORD = "END"
 
 CONTROL_FLOW_KEYWORDS = [
     "GO",
-    r"BEGIN\w+TRY",
-    r"BEGIN\w+CATCH",
+    r"BEGIN\s+TRY",
+    r"BEGIN\s+CATCH",
     "BEGIN",
-    r"END\w+TRY",
-    r"END\w+CATCH",
-    "END",
+    r"END\s+TRY",
+    r"END\s+CATCH",
+    # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
+    # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
+    "IF",
+    # For things like CASE, END does not mean the end of a statement.
+    # We have special handling for this.
+    END_KEYWORD,
+    # "ELSE",  # else is also valid in CASE, so we we can't use it here.
 ]
 
 # There's an exception to this rule, which is when the statement
-# is preceeded by a CTE.
-FORCE_NEW_STATEMENT_KEYWORDS = [
+# is preceded by a CTE. For those, we have to check if the character
+# before this is a ")".
+NEW_STATEMENT_KEYWORDS = [
     # SELECT is used inside queries as well, so we can't include it here.
+    "CREATE",
     "INSERT",
     "UPDATE",
     "DELETE",
     "MERGE",
 ]
+STRICT_NEW_STATEMENT_KEYWORDS = [
+    # For these keywords, a SELECT following it does indicate a new statement.
+    "DROP",
+    "TRUNCATE",
+]
+
+
+class _AlreadyIncremented(Exception):
+    # Using exceptions for control flow isn't great - but the code is clearer so it's fine.
+    pass
 
 
 class ParserState(Enum):
@@ -30,134 +52,206 @@ class ParserState(Enum):
     MULTILINE_COMMENT = 4
 
 
-def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
-    """
-    Check if a keyword exists at the given position using regex word boundaries.
-    """
-    if pos + len(keyword) > len(sql):
-        return False
+class _StatementSplitter:
+    def __init__(self, sql: str):
+        self.sql = sql
 
-    # If we're not at a word boundary, we can't generate a keyword.
-    if pos > 0 and not (
-        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
-        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
-    ):
-        return False
+        # Main parser state.
+        self.i = 0
+        self.state = ParserState.NORMAL
+        self.current_statement: List[str] = []
 
-    pattern = rf"^{re.escape(keyword)}\b"
-    match = re.match(pattern, sql[pos:], re.IGNORECASE)
-    return bool(match)
+        # Additional parser state.
 
+        # If we see a SELECT, should we start a new statement?
+        # If we previously saw a drop/truncate/etc, a SELECT does mean a new statement.
+        # But if we're in a select/create/etc, a select could just be a subquery.
+        self.does_select_mean_new_statement = False
 
-def _look_ahead_for_keywords(
-    sql: str, pos: int, keywords: List[str]
-) -> Tuple[bool, str, int]:
-    """
-    Look ahead for SQL keywords at the current position.
-    """
+        # The END keyword terminates CASE and BEGIN blocks.
+        # We need to match the CASE statements with END blocks to determine
+        # what a given END is closing.
+        self.current_case_statements = 0
 
-    for keyword in keywords:
-        if _is_keyword_at_position(sql, pos, keyword):
-            return True, keyword, len(keyword)
-    return False, "", 0
+    def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
+        """
+        Check if a keyword exists at the given position using regex word boundaries.
+        """
+        sql = self.sql
 
+        keyword_length = len(keyword.replace(r"\s+", " "))
 
-def split_statements(sql: str) -> Generator[str, None, None]:
-    """
-    Split T-SQL code into individual statements, handling various SQL constructs.
-    """
-    if not sql or not sql.strip():
-        return
+        if pos + keyword_length > len(sql):
+            return False, ""
+
+        # If we're not at a word boundary, we can't generate a keyword.
+        if pos > 0 and not (
+            bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+            or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+        ):
+            return False, ""
+
+        pattern = rf"^{keyword}\b"
+        match = re.match(pattern, sql[pos:], re.IGNORECASE)
+        is_match = bool(match)
+        actual_match = (
+            sql[pos:][match.start() : match.end()] if match is not None else ""
+        )
+        return is_match, actual_match
 
-    current_statement: List[str] = []
-    state = ParserState.NORMAL
-    i = 0
+    def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
+        """
+        Look ahead for SQL keywords at the current position.
+        """
 
-    def yield_if_complete() -> Generator[str, None, None]:
-        statement = "".join(current_statement).strip()
+        for keyword in keywords:
+            is_match, keyword = self._is_keyword_at_position(self.i, keyword)
+            if is_match:
+                return True, keyword, len(keyword)
+        return False, "", 0
+
+    def _yield_if_complete(self) -> Iterator[str]:
+        statement = "".join(self.current_statement).strip()
         if statement:
+            # Subtle - to avoid losing full whitespace, they get merged into the next statement.
            yield statement
-        current_statement.clear()
-
-    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
-    while i < len(sql):
-        c = sql[i]
-        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
-
-        if state == ParserState.NORMAL:
-            if c == "'":
-                state = ParserState.STRING
-                current_statement.append(c)
-                prev_real_char = c
-            elif c == "-" and next_char == "-":
-                state = ParserState.COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            elif c == "/" and next_char == "*":
-                state = ParserState.MULTILINE_COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            else:
-                most_recent_real_char = prev_real_char
-                if not c.isspace():
-                    prev_real_char = c
+        self.current_statement.clear()
 
-                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
-                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
-                )
-                if is_control_keyword:
-                    # Yield current statement if any
-                    yield from yield_if_complete()
-                    # Yield keyword as its own statement
-                    yield keyword
-                    i += keyword_len
-                    continue
-
-                (
-                    is_force_new_statement_keyword,
-                    keyword,
-                    keyword_len,
-                ) = _look_ahead_for_keywords(
-                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
-                )
-                if (
-                    is_force_new_statement_keyword and most_recent_real_char != ")"
-                ):  # usually we'd have a close paren that closes a CTE
-                    # Force termination of current statement
-                    yield from yield_if_complete()
-
-                    current_statement.append(keyword)
-                    i += keyword_len
-                    continue
-
-                elif c == ";":
-                    yield from yield_if_complete()
+        # Reset current_statement-specific state.
+        self.does_select_mean_new_statement = False
+        if self.current_case_statements != 0:
+            breakpoint()
+        self.current_case_statements = 0
+
+    def process(self) -> Iterator[str]:
+        if not self.sql or not self.sql.strip():
+            yield from ()
+
+        prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+        while self.i < len(self.sql):
+            c = self.sql[self.i]
+            next_char = self.sql[self.i + 1] if self.i < len(self.sql) - 1 else "\0"
+
+            if self.state == ParserState.NORMAL:
+                if c == "'":
+                    self.state = ParserState.STRING
+                    self.current_statement.append(c)
+                    prev_real_char = c
+                elif c == "-" and next_char == "-":
+                    self.state = ParserState.COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "/" and next_char == "*":
+                    self.state = ParserState.MULTILINE_COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
                 else:
-                    current_statement.append(c)
-
-        elif state == ParserState.STRING:
-            current_statement.append(c)
-            if c == "'" and next_char == "'":
-                current_statement.append(next_char)
-                i += 1
-            elif c == "'":
-                state = ParserState.NORMAL
-
-        elif state == ParserState.COMMENT:
-            current_statement.append(c)
-            if c == "\n":
-                state = ParserState.NORMAL
-
-        elif state == ParserState.MULTILINE_COMMENT:
-            current_statement.append(c)
-            if c == "*" and next_char == "/":
-                current_statement.append(next_char)
-                i += 1
-                state = ParserState.NORMAL
-
-        i += 1
-
-    # Handle the last statement
-    yield from yield_if_complete()
+                    most_recent_real_char = prev_real_char
+                    if not c.isspace():
+                        prev_real_char = c
+
+                    try:
+                        yield from self._process_normal(
+                            most_recent_real_char=most_recent_real_char
+                        )
+                    except _AlreadyIncremented:
+                        # Skip the normal i += 1 step.
+                        continue
+
+            elif self.state == ParserState.STRING:
+                self.current_statement.append(c)
+                if c == "'" and next_char == "'":
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "'":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.COMMENT:
+                self.current_statement.append(c)
+                if c == "\n":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.MULTILINE_COMMENT:
+                self.current_statement.append(c)
+                if c == "*" and next_char == "/":
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                    self.state = ParserState.NORMAL
+
+            self.i += 1
+
+        # Handle the last statement
+        yield from self._yield_if_complete()
+
+    def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
+        c = self.sql[self.i]
+
+        if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
+            self.current_case_statements += 1
+
+        is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
+            keywords=CONTROL_FLOW_KEYWORDS
+        )
+        if (
+            is_control_keyword
+            and keyword == END_KEYWORD
+            and self.current_case_statements > 0
+        ):
+            # If we're closing a CASE statement with END, we can just decrement the counter and continue.
+            self.current_case_statements -= 1
+        elif is_control_keyword:
+            # Yield current statement if any
+            yield from self._yield_if_complete()
+            # Yield keyword as its own statement
+            yield keyword
+            self.i += keyword_len
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_strict_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(keywords=STRICT_NEW_STATEMENT_KEYWORDS)
+        if is_strict_new_statement_keyword:
+            yield from self._yield_if_complete()
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_force_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(
+            keywords=(
+                NEW_STATEMENT_KEYWORDS
+                + ([SELECT_KEYWORD] if self.does_select_mean_new_statement else [])
+            ),
+        )
+        if (
+            is_force_new_statement_keyword and most_recent_real_char != ")"
+        ):  # usually we'd have a close paren that closes a CTE
+            # Force termination of current statement
+            yield from self._yield_if_complete()
+
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            raise _AlreadyIncremented()
+
+        if c == ";":
+            yield from self._yield_if_complete()
+        else:
+            self.current_statement.append(c)
+
+
+def split_statements(sql: str) -> Iterator[str]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+
+    splitter = _StatementSplitter(sql)
+    yield from splitter.process()
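
The refactor keeps the character-by-character parser but adds CASE/END matching (so an END inside an expression no longer terminates a statement) and DROP/TRUNCATE tracking (so a following SELECT starts a new statement). A rough usage sketch; the behavior described in the comments is inferred from the parser logic above, not captured output:

    from datahub.sql_parsing.split_statements import split_statements

    sql = """
    UPDATE t SET flag = CASE WHEN x > 0 THEN 1 ELSE 0 END
    DROP TABLE staging_t
    SELECT * FROM t
    """

    # Expected per the logic above: the END closes the CASE expression, so the
    # UPDATE stays one statement; DROP starts a new statement and primes the
    # parser so the SELECT after it is split off as its own statement.
    for statement in split_statements(sql):
        print(repr(statement))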

datahub/sql_parsing/sql_parsing_common.py
@@ -24,12 +24,19 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
     # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
     # are case preserving but case insensitive.
     "mssql",
+    # Oracle automatically converts unquoted identifiers to uppercase.
+    # https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Database-Object-Names-and-Qualifiers.html#GUID-3C59E44A-5140-4BCA-B9E1-3039C8050C49
+    # In our Oracle connector, we then normalize column names to lowercase. This behavior
+    # actually comes from the underlying Oracle sqlalchemy dialect.
+    # https://github.com/sqlalchemy/sqlalchemy/blob/d9b4d8ff3aae504402d324f3ebf0b8faff78f5dc/lib/sqlalchemy/dialects/oracle/base.py#L2579
+    "oracle",
 }
 DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
     # In some dialects, column identifiers are effectively case insensitive
     # because they are automatically converted to uppercase. Most other systems
     # automatically lowercase unquoted identifiers.
     "snowflake",
+    "oracle",
 }
 assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
     DIALECTS_WITH_CASE_INSENSITIVE_COLS
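
In practice this marks Oracle as a case-insensitive, default-uppercase dialect for column resolution in the SQL parser. A minimal check against the module-level constants shown in the diff above:

    from datahub.sql_parsing.sql_parsing_common import (
        DIALECTS_WITH_CASE_INSENSITIVE_COLS,
        DIALECTS_WITH_DEFAULT_UPPERCASE_COLS,
    )

    # As of 1.0.0, Oracle appears in both sets.
    assert "oracle" in DIALECTS_WITH_CASE_INSENSITIVE_COLS
    assert "oracle" in DIALECTS_WITH_DEFAULT_UPPERCASE_COLS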

datahub/sql_parsing/sqlglot_lineage.py
@@ -473,7 +473,7 @@ def _create_table_ddl_cll(
     return column_lineage
 
 
-def _select_statement_cll(  # noqa: C901
+def _select_statement_cll(
     statement: _SupportedColumnLineageTypes,
     dialect: sqlglot.Dialect,
     root_scope: sqlglot.optimizer.Scope,

datahub/sql_parsing/sqlglot_utils.py
@@ -56,10 +56,7 @@ def get_dialect(platform: DialectOrStr) -> sqlglot.Dialect:
 def is_dialect_instance(
     dialect: sqlglot.Dialect, platforms: Union[str, Iterable[str]]
 ) -> bool:
-    if isinstance(platforms, str):
-        platforms = [platforms]
-    else:
-        platforms = list(platforms)
+    platforms = [platforms] if isinstance(platforms, str) else list(platforms)
 
     dialects = [get_dialect(platform) for platform in platforms]
 

datahub/testing/check_sql_parser_result.py
@@ -1,5 +1,4 @@
 import logging
-import os
 import pathlib
 from typing import Any, Dict, Optional
 
@@ -8,11 +7,10 @@ import deepdiff
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
-UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true"
-
 
 def assert_sql_result_with_resolver(
     sql: str,
@@ -22,6 +20,8 @@ def assert_sql_result_with_resolver(
     allow_table_error: bool = False,
     **kwargs: Any,
 ) -> None:
+    settings = get_golden_settings()
+
     # HACK: Our BigQuery source overwrites this value and doesn't undo it.
     # As such, we need to handle that here.
     BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd"
@@ -47,15 +47,14 @@ def assert_sql_result_with_resolver(
     )
 
     txt = res.json(indent=4)
-    if UPDATE_FILES:
+    if settings.update_golden:
         expected_file.write_text(txt)
         return
 
     if not expected_file.exists():
         expected_file.write_text(txt)
         raise AssertionError(
-            f"Expected file {expected_file} does not exist. "
-            "Created it with the expected output. Please verify it."
+            f"Missing expected golden file; run with --update-golden-files to create it: {expected_file}"
         )
 
     expected = SqlParsingResult.parse_raw(expected_file.read_text())

datahub/testing/compare_metadata_json.py
@@ -16,6 +16,7 @@ from deepdiff import DeepDiff
 from datahub.ingestion.sink.file import write_metadata_file
 from datahub.ingestion.source.file import read_metadata_file
 from datahub.testing.mcp_diff import CannotCompareMCPs, MCPDiff, get_aspects_by_urn
+from datahub.testing.pytest_hooks import get_golden_settings
 
 logger = logging.getLogger(__name__)
 
@@ -40,26 +41,26 @@ def load_json_file(filename: Union[str, os.PathLike]) -> MetadataJson:
 def assert_metadata_files_equal(
     output_path: Union[str, os.PathLike],
     golden_path: Union[str, os.PathLike],
-    update_golden: bool,
-    copy_output: bool,
     ignore_paths: Sequence[str] = (),
     ignore_paths_v2: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> None:
+    settings = get_golden_settings()
+
     golden_exists = os.path.isfile(golden_path)
 
-    if copy_output:
+    if settings.copy_output:
         shutil.copyfile(str(output_path), str(golden_path) + ".output")
         logger.info(f"Copied output file to {golden_path}.output")
 
-    if not update_golden and not golden_exists:
+    if not settings.update_golden and not golden_exists:
         raise FileNotFoundError(
             "Golden file does not exist. Please run with the --update-golden-files option to create."
         )
 
     output = load_json_file(output_path)
 
-    if update_golden and not golden_exists:
+    if settings.update_golden and not golden_exists:
         shutil.copyfile(str(output_path), str(golden_path))
         return
     else:
@@ -87,7 +88,7 @@ def assert_metadata_files_equal(
     ignore_paths = (*ignore_paths, *default_exclude_paths)
 
     diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
-    if diff and update_golden:
+    if diff and settings.update_golden:
         if isinstance(diff, MCPDiff) and diff.is_delta_valid:
             logger.info(f"Applying delta to golden file {golden_path}")
             diff.apply_delta(golden)
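
Callers of assert_metadata_files_equal no longer pass update_golden / copy_output; both now come from the pytest options via get_golden_settings(). A sketch of the new call shape, assuming the pytest hooks from datahub/testing/pytest_hooks.py (next section) are registered; the paths and ignore rule here are illustrative only:

    from datahub.testing.compare_metadata_json import assert_metadata_files_equal

    # Runs inside a pytest test; golden regeneration is controlled by
    # `pytest --update-golden-files` rather than function arguments.
    assert_metadata_files_equal(
        output_path="my_source_mces.json",
        golden_path="my_source_mces_golden.json",
        ignore_paths=(r"root\[\d+\]\['systemMetadata'\]",),
    )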

datahub/testing/pytest_hooks.py (new file)
@@ -0,0 +1,56 @@
+import dataclasses
+from typing import Optional
+
+import pytest
+
+__all__ = [
+    "load_golden_flags",
+    "get_golden_settings",
+    "pytest_addoption",
+    "GoldenFileSettings",
+]
+
+
+@dataclasses.dataclass
+class GoldenFileSettings:
+    update_golden: bool
+    copy_output: bool
+
+
+_registered: bool = False
+_settings: Optional[GoldenFileSettings] = None
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption(
+        "--update-golden-files",
+        action="store_true",
+        default=False,
+    )
+
+    # TODO: Deprecate and remove this flag.
+    parser.addoption("--copy-output-files", action="store_true", default=False)
+
+    global _registered
+    _registered = True
+
+
+@pytest.fixture(scope="session", autouse=True)
+def load_golden_flags(pytestconfig: pytest.Config) -> None:
+    global _settings
+    _settings = GoldenFileSettings(
+        update_golden=pytestconfig.getoption("--update-golden-files"),
+        copy_output=pytestconfig.getoption("--copy-output-files"),
+    )
+
+
+def get_golden_settings() -> GoldenFileSettings:
+    if not _registered:
+        raise ValueError(
+            "Golden files aren't set up properly. Call register_golden_flags from a conftest pytest_addoptions method."
+        )
+    if not _settings:
+        raise ValueError(
+            "Golden files aren't set up properly. Ensure load_golden_flags is imported in your conftest."
+        )
+    return _settings
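
A minimal sketch of how a consuming test suite might wire these hooks into its own conftest.py; the layout is inferred from the module above rather than copied from the datahub repo:

    # conftest.py (sketch)
    # Re-exporting these names lets pytest register the --update-golden-files /
    # --copy-output-files options and the session-scoped settings fixture.
    from datahub.testing.pytest_hooks import (  # noqa: F401
        load_golden_flags,
        pytest_addoption,
    )

With that in place, running pytest with --update-golden-files regenerates goldens through the helpers shown earlier, replacing both the old UPDATE_SQLPARSER_FILES environment variable and the explicit update_golden / copy_output arguments.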

datahub/upgrade/upgrade.py
@@ -293,9 +293,9 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
     return server.version.micro - client.version.micro
 
 
-def _maybe_print_upgrade_message(  # noqa: C901
+def _maybe_print_upgrade_message(
     version_stats: Optional[DataHubVersionStats],
-) -> None:  # noqa: C901
+) -> None:
     days_before_cli_stale = 7
     days_before_quickstart_stale = 7
 

datahub/utilities/file_backed_collections.py
@@ -10,13 +10,11 @@ import tempfile
 import threading
 from dataclasses import dataclass, field
 from datetime import datetime
-from enum import Enum
 from types import TracebackType
 from typing import (
     Any,
     Callable,
     Dict,
-    Final,
     Generic,
     Iterator,
     List,
@@ -31,6 +29,7 @@ from typing import (
 )
 
 from datahub.ingestion.api.closeable import Closeable
+from datahub.utilities.sentinels import Unset, unset
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -59,16 +58,6 @@ SqliteValue = Union[int, float, str, bytes, datetime, None]
 _VT = TypeVar("_VT")
 
 
-class Unset(Enum):
-    token = 0
-
-# It's pretty annoying to create a true sentinel that works with typing.
-# https://peps.python.org/pep-0484/#support-for-singleton-types-in-unions
-# Can't wait for https://peps.python.org/pep-0661/
-_unset: Final = Unset.token
-
-
-
 class ConnectionWrapper:
     """
     Wraps a SQlite connection, allowing connection reuse across multiple FileBacked* objects.
@@ -372,7 +361,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         self,
         /,
         key: str,
-        default: Union[_VT, Unset] = _unset,
+        default: Union[_VT, Unset] = unset,
     ) -> _VT:
         # If key is in the dictionary, this is similar to __getitem__ + mark_dirty.
         # If key is not in the dictionary, this is similar to __setitem__.
@@ -383,7 +372,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self.mark_dirty(key)
             return value
         except KeyError:
-            if default is _unset:
+            if default is unset:
                 raise
 
             self[key] = default
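
The Unset/_unset sentinel removed here has moved to the new datahub/utilities/sentinels.py module (+22 lines in the file list; its diff is not shown on this page). A rough sketch of an equivalent module, inferred from the code removed above and the new import, might look like:

    # Sketch only - the actual datahub/utilities/sentinels.py may differ in detail.
    from enum import Enum
    from typing import Final


    class Unset(Enum):
        token = 0


    # Typing-friendly sentinel (PEP 484 singleton-in-union pattern); callers write
    # `default: Union[_VT, Unset] = unset` and test identity with `default is unset`.
    unset: Final = Unset.token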