acryl-datahub 0.15.0.6rc3 → 1.0.0 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/sdk/search_filters.py (new file, +374 -0)
@@ -0,0 +1,374 @@
+ from __future__ import annotations
+
+ import abc
+ from typing import (
+     Any,
+     List,
+     Sequence,
+     TypedDict,
+     Union,
+ )
+
+ import pydantic
+
+ from datahub.configuration.common import ConfigModel
+ from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
+ from datahub.ingestion.graph.client import entity_type_to_graphql
+ from datahub.ingestion.graph.filters import SearchFilterRule
+ from datahub.metadata.schema_classes import EntityTypeName
+ from datahub.metadata.urns import DataPlatformUrn, DomainUrn
+
+ _AndSearchFilterRule = TypedDict(
+     "_AndSearchFilterRule", {"and": List[SearchFilterRule]}
+ )
+ _OrFilters = List[_AndSearchFilterRule]
+
+
+ class _BaseFilter(ConfigModel):
+     class Config:
+         # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin
+         # doesn't recognize it properly. So unfortunately we'll need to live
+         # with the deprecation warning w/ pydantic v2.
+         allow_population_by_field_name = True
+         if PYDANTIC_VERSION_2:
+             populate_by_name = True
+
+     @abc.abstractmethod
+     def compile(self) -> _OrFilters:
+         pass
+
+
+ def _flexible_entity_type_to_graphql(entity_type: str) -> str:
+     if entity_type.upper() == entity_type:
+         # Assume that we were passed a graphql EntityType enum value,
+         # so no conversion is needed.
+         return entity_type
+     return entity_type_to_graphql(entity_type)
+
+
+ class _EntityTypeFilter(_BaseFilter):
+     entity_type: List[str] = pydantic.Field(
+         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
+     )
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="_entityType",
+             condition="EQUAL",
+             values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _EntitySubtypeFilter(_BaseFilter):
+     entity_type: str
+     entity_subtype: str = pydantic.Field(
+         description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
+     )
+
+     def compile(self) -> _OrFilters:
+         rules = [
+             SearchFilterRule(
+                 field="_entityType",
+                 condition="EQUAL",
+                 values=[_flexible_entity_type_to_graphql(self.entity_type)],
+             ),
+             SearchFilterRule(
+                 field="typeNames",
+                 condition="EQUAL",
+                 values=[self.entity_subtype],
+             ),
+         ]
+         return [{"and": rules}]
+
+
+ class _PlatformFilter(_BaseFilter):
+     platform: List[str]
+     # TODO: Add validator to convert string -> list of strings
+
+     @pydantic.validator("platform", each_item=True)
+     def validate_platform(cls, v: str) -> str:
+         # Subtle - we use the constructor instead of the from_string method
+         # because coercion is acceptable here.
+         return str(DataPlatformUrn(v))
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="platform.keyword",
+             condition="EQUAL",
+             values=self.platform,
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _DomainFilter(_BaseFilter):
+     domain: List[str]
+
+     @pydantic.validator("domain", each_item=True)
+     def validate_domain(cls, v: str) -> str:
+         return str(DomainUrn.from_string(v))
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="domains",
+             condition="EQUAL",
+             values=self.domain,
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _EnvFilter(_BaseFilter):
+     # Note that not all entity types have an env (e.g. dashboards / charts).
+     # If the env filter is specified, these will be excluded.
+     env: List[str]
+
+     def compile(self) -> _OrFilters:
+         return [
+             # For most entity types, we look at the origin field.
+             {
+                 "and": [
+                     SearchFilterRule(
+                         field="origin",
+                         condition="EQUAL",
+                         values=self.env,
+                     ),
+                 ]
+             },
+             # For containers, we now have an "env" property as of
+             # https://github.com/datahub-project/datahub/pull/11214
+             # Prior to this, we put "env" in the customProperties. But we're
+             # not bothering with that here.
+             {
+                 "and": [
+                     SearchFilterRule(
+                         field="env",
+                         condition="EQUAL",
+                         values=self.env,
+                     ),
+                 ]
+             },
+         ]
+
+
+ class _CustomCondition(_BaseFilter):
+     """Represents a single field condition"""
+
+     field: str
+     condition: str
+     values: List[str]
+
+     def compile(self) -> _OrFilters:
+         rule = SearchFilterRule(
+             field=self.field,
+             condition=self.condition,
+             values=self.values,
+         )
+         return [{"and": [rule]}]
+
+
+ class _And(_BaseFilter):
+     """Represents an AND conjunction of filters"""
+
+     and_: Sequence["Filter"] = pydantic.Field(alias="and")
+     # TODO: Add validator to ensure that the "and" field is not empty
+
+     def compile(self) -> _OrFilters:
+         # The "and" operator must be implemented by doing a Cartesian product
+         # of the OR clauses.
+         # Example 1:
+         # (A or B) and (C or D) ->
+         # (A and C) or (A and D) or (B and C) or (B and D)
+         # Example 2:
+         # (A or B) and (C or D) and (E or F) ->
+         # (A and C and E) or (A and C and F) or (A and D and E) or (A and D and F) or
+         # (B and C and E) or (B and C and F) or (B and D and E) or (B and D and F)
+
+         # Start with the first filter's OR clauses
+         result = self.and_[0].compile()
+
+         # For each subsequent filter
+         for filter in self.and_[1:]:
+             new_result = []
+             # Get its OR clauses
+             other_clauses = filter.compile()
+
+             # Create Cartesian product
+             for existing_clause in result:
+                 for other_clause in other_clauses:
+                     # Merge the AND conditions from both clauses
+                     new_result.append(self._merge_ands(existing_clause, other_clause))
+
+             result = new_result
+
+         return result
+
+     @classmethod
+     def _merge_ands(
+         cls, a: _AndSearchFilterRule, b: _AndSearchFilterRule
+     ) -> _AndSearchFilterRule:
+         return {
+             "and": [
+                 *a["and"],
+                 *b["and"],
+             ]
+         }
+
+
+ class _Or(_BaseFilter):
+     """Represents an OR conjunction of filters"""
+
+     or_: Sequence["Filter"] = pydantic.Field(alias="or")
+     # TODO: Add validator to ensure that the "or" field is not empty
+
+     def compile(self) -> _OrFilters:
+         merged_filter = []
+         for filter in self.or_:
+             merged_filter.extend(filter.compile())
+         return merged_filter
+
+
+ class _Not(_BaseFilter):
+     """Represents a NOT filter"""
+
+     not_: "Filter" = pydantic.Field(alias="not")
+
+     @pydantic.validator("not_", pre=False)
+     def validate_not(cls, v: "Filter") -> "Filter":
+         inner_filter = v.compile()
+         if len(inner_filter) != 1:
+             raise ValueError(
+                 "Cannot negate a filter with multiple OR clauses [not yet supported]"
+             )
+         return v
+
+     def compile(self) -> _OrFilters:
+         # TODO: Eventually we'll want to implement a full DNF normalizer.
+         # https://en.wikipedia.org/wiki/Disjunctive_normal_form#Conversion_to_DNF
+
+         inner_filter = self.not_.compile()
+         assert len(inner_filter) == 1  # validated above
+
+         # ¬(A and B) -> (¬A) OR (¬B)
+         and_filters = inner_filter[0]["and"]
+         final_filters: _OrFilters = []
+         for rule in and_filters:
+             final_filters.append({"and": [rule.negate()]})
+
+         return final_filters
+
+
+ # TODO: With pydantic 2, we can use a RootModel with a
+ # discriminated union to make the error messages more informative.
+ Filter = Union[
+     _And,
+     _Or,
+     _Not,
+     _EntityTypeFilter,
+     _EntitySubtypeFilter,
+     _PlatformFilter,
+     _DomainFilter,
+     _EnvFilter,
+     _CustomCondition,
+ ]
+
+
+ # Required to resolve forward references to "Filter"
+ if PYDANTIC_VERSION_2:
+     _And.model_rebuild()  # type: ignore
+     _Or.model_rebuild()  # type: ignore
+     _Not.model_rebuild()  # type: ignore
+ else:
+     _And.update_forward_refs()
+     _Or.update_forward_refs()
+     _Not.update_forward_refs()
+
+
+ def load_filters(obj: Any) -> Filter:
+     if PYDANTIC_VERSION_2:
+         return pydantic.TypeAdapter(Filter).validate_python(obj)  # type: ignore
+     else:
+         return pydantic.parse_obj_as(Filter, obj)  # type: ignore
+
+
+ # We need FilterDsl for two reasons:
+ # 1. To provide wrapper methods around lots of filters while avoid bloating the
+ #    yaml spec.
+ # 2. Pydantic models in general don't support positional arguments, making the
+ #    calls feel repetitive (e.g. Platform(platform=...)).
+ #    See https://github.com/pydantic/pydantic/issues/6792
+ # We also considered using dataclasses / pydantic dataclasses, but
+ # ultimately decided that they didn't quite suit our requirements,
+ # particularly with regards to the field aliases for and/or/not.
+ class FilterDsl:
+     @staticmethod
+     def and_(*args: "Filter") -> _And:
+         return _And(and_=list(args))
+
+     @staticmethod
+     def or_(*args: "Filter") -> _Or:
+         return _Or(or_=list(args))
+
+     @staticmethod
+     def not_(arg: "Filter") -> _Not:
+         return _Not(not_=arg)
+
+     @staticmethod
+     def entity_type(
+         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
+     ) -> _EntityTypeFilter:
+         return _EntityTypeFilter(
+             entity_type=(
+                 [entity_type] if isinstance(entity_type, str) else list(entity_type)
+             )
+         )
+
+     @staticmethod
+     def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
+         return _EntitySubtypeFilter(
+             entity_type=entity_type,
+             entity_subtype=subtype,
+         )
+
+     @staticmethod
+     def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
+         return _PlatformFilter(
+             platform=[platform] if isinstance(platform, str) else platform
+         )
+
+     # TODO: Add a platform_instance filter
+
+     @staticmethod
+     def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
+         return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)
+
+     @staticmethod
+     def env(env: Union[str, List[str]], /) -> _EnvFilter:
+         return _EnvFilter(env=[env] if isinstance(env, str) else env)
+
+     @staticmethod
+     def has_custom_property(key: str, value: str) -> _CustomCondition:
+         return _CustomCondition(
+             field="customProperties",
+             condition="EQUAL",
+             values=[f"{key}={value}"],
+         )
+
+     # TODO: Add a soft-deletion status filter
+     # TODO: add a container / browse path filter
+     # TODO add shortcut for custom filters
+
+     @staticmethod
+     def custom_filter(
+         field: str, condition: str, values: List[str]
+     ) -> _CustomCondition:
+         return _CustomCondition(
+             field=field,
+             condition=condition,
+             values=values,
+         )
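
For orientation, a minimal usage sketch based only on the code above; the entity type, platform, and env values are illustrative examples, not defaults shipped by the package.

# Illustrative sketch, not taken from the package's documentation.
from datahub.sdk.search_filters import FilterDsl as F, load_filters

# Compose a filter programmatically with the DSL...
flt = F.and_(
    F.entity_type("dataset"),
    F.platform("snowflake"),
    F.env("PROD"),
)

# ...or build the same thing from a parsed YAML/JSON structure.
flt_from_dict = load_filters(
    {
        "and": [
            {"entity_type": ["dataset"]},
            {"platform": ["snowflake"]},
            {"env": ["PROD"]},
        ]
    }
)

# compile() flattens the tree into the _OrFilters shape defined above:
# a list of {"and": [SearchFilterRule, ...]} clauses that are OR'd together.
or_clauses = flt.compile()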

datahub/specific/dataset.py (+3 -4)
@@ -15,6 +15,7 @@ from datahub.metadata.schema_classes import (
      UpstreamClass as Upstream,
      UpstreamLineageClass as UpstreamLineage,
  )
+ from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
  from datahub.specific.aspect_helpers.structured_properties import (
@@ -22,8 +23,6 @@ from datahub.specific.aspect_helpers.structured_properties import (
  )
  from datahub.specific.aspect_helpers.tags import HasTagsPatch
  from datahub.specific.aspect_helpers.terms import HasTermsPatch
- from datahub.utilities.urns.tag_urn import TagUrn
- from datahub.utilities.urns.urn import Urn

  _Parent = TypeVar("_Parent", bound=MetadataPatchProposal)

@@ -104,12 +103,12 @@ class DatasetPatchBuilder(
  ):
      def __init__(
          self,
-         urn: str,
+         urn: Union[str, DatasetUrn],
          system_metadata: Optional[SystemMetadataClass] = None,
          audit_header: Optional[KafkaAuditHeaderClass] = None,
      ) -> None:
          super().__init__(
-             urn, system_metadata=system_metadata, audit_header=audit_header
+             str(urn), system_metadata=system_metadata, audit_header=audit_header
          )

      @classmethod
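
DatasetPatchBuilder now accepts either a raw urn string or a DatasetUrn object and stringifies it before delegating to the parent constructor. A minimal sketch; the dataset urn below is a made-up example.

from datahub.metadata.urns import DatasetUrn
from datahub.specific.dataset import DatasetPatchBuilder

# Hypothetical example urn; any existing dataset urn works the same way.
urn = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"
)

# Both forms are accepted; the builder calls str(urn) internally.
patch_from_str = DatasetPatchBuilder(str(urn))
patch_from_urn = DatasetPatchBuilder(urn)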

datahub/sql_parsing/_sqlglot_patch.py (+2 -10; this module embeds a textual patch for sqlglot, so the changed lines below are themselves diff text)
@@ -172,17 +172,9 @@ def _patch_lineage() -> None:
  derived_tables = [
      source.expression.parent
      for source in scope.sources.values()
- @@ -254,6 +257,7 @@ def to_node(
- if dt.comments and dt.comments[0].startswith("source: ")
- }
-
- + c: exp.Column
- for c in source_columns:
- table = c.table
- source = scope.sources.get(table)
  @@ -281,8 +285,21 @@ def to_node(
- # it means this column's lineage is unknown. This can happen if the definition of a source used in a query
- # is not passed into the `sources` map.
+ # is unknown. This can happen if the definition of a source used in a query is not
+ # passed into the `sources` map.
  source = source or exp.Placeholder()
  +
  + subfields = []

datahub/sql_parsing/schema_resolver.py (+1 -1)
@@ -13,7 +13,7 @@ from datahub.ingestion.graph.client import DataHubGraph
  from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
  from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass
  from datahub.metadata.urns import DataPlatformUrn
- from datahub.sql_parsing._models import _TableName as _TableName  # noqa: I250
+ from datahub.sql_parsing._models import _TableName as _TableName
  from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES
  from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
  from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

datahub/sql_parsing/split_statements.py (+20 -13)
@@ -8,11 +8,11 @@ END_KEYWORD = "END"

  CONTROL_FLOW_KEYWORDS = [
      "GO",
-     r"BEGIN\w+TRY",
-     r"BEGIN\w+CATCH",
+     r"BEGIN\s+TRY",
+     r"BEGIN\s+CATCH",
      "BEGIN",
-     r"END\w+TRY",
-     r"END\w+CATCH",
+     r"END\s+TRY",
+     r"END\s+CATCH",
      # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
      # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
      "IF",
@@ -73,25 +73,31 @@ class _StatementSplitter:
          # what a given END is closing.
          self.current_case_statements = 0

-     def _is_keyword_at_position(self, pos: int, keyword: str) -> bool:
+     def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
          """
          Check if a keyword exists at the given position using regex word boundaries.
          """
          sql = self.sql

-         if pos + len(keyword) > len(sql):
-             return False
+         keyword_length = len(keyword.replace(r"\s+", " "))
+
+         if pos + keyword_length > len(sql):
+             return False, ""

          # If we're not at a word boundary, we can't generate a keyword.
          if pos > 0 and not (
              bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
              or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
          ):
-             return False
+             return False, ""

-         pattern = rf"^{re.escape(keyword)}\b"
+         pattern = rf"^{keyword}\b"
          match = re.match(pattern, sql[pos:], re.IGNORECASE)
-         return bool(match)
+         is_match = bool(match)
+         actual_match = (
+             sql[pos:][match.start() : match.end()] if match is not None else ""
+         )
+         return is_match, actual_match

      def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
          """
@@ -99,7 +105,8 @@ class _StatementSplitter:
          """

          for keyword in keywords:
-             if self._is_keyword_at_position(self.i, keyword):
+             is_match, keyword = self._is_keyword_at_position(self.i, keyword)
+             if is_match:
                  return True, keyword, len(keyword)
          return False, "", 0

@@ -118,7 +125,7 @@ class _StatementSplitter:

      def process(self) -> Iterator[str]:
          if not self.sql or not self.sql.strip():
-             return
+             yield from ()

          prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
          while self.i < len(self.sql):
@@ -181,7 +188,7 @@ class _StatementSplitter:
      def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
          c = self.sql[self.i]

-         if self._is_keyword_at_position(self.i, CASE_KEYWORD):
+         if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
              self.current_case_statements += 1

          is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
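
The keyword patterns above are used as regular expressions, so the gap between BEGIN/END and TRY/CATCH has to be whitespace (\s+) rather than word characters (\w+); the matcher also stops wrapping the pattern in re.escape, so \s+ keeps its regex meaning. A standalone check of the old versus new pattern (not part of the package):

import re

sql = "BEGIN TRY  SELECT 1; END TRY"

# Old pattern: \w+ requires word characters between BEGIN and TRY, so it can
# never match the whitespace-separated keyword pair.
assert re.match(r"^BEGIN\w+TRY\b", sql, re.IGNORECASE) is None

# New pattern: \s+ matches the run of spaces between the two words.
match = re.match(r"^BEGIN\s+TRY\b", sql, re.IGNORECASE)
assert match is not None and match.group(0) == "BEGIN TRY"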

datahub/sql_parsing/sql_parsing_common.py (+7 -0)
@@ -24,12 +24,19 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
      # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
      # are case preserving but case insensitive.
      "mssql",
+     # Oracle automatically converts unquoted identifiers to uppercase.
+     # https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Database-Object-Names-and-Qualifiers.html#GUID-3C59E44A-5140-4BCA-B9E1-3039C8050C49
+     # In our Oracle connector, we then normalize column names to lowercase. This behavior
+     # actually comes from the underlying Oracle sqlalchemy dialect.
+     # https://github.com/sqlalchemy/sqlalchemy/blob/d9b4d8ff3aae504402d324f3ebf0b8faff78f5dc/lib/sqlalchemy/dialects/oracle/base.py#L2579
+     "oracle",
  }
  DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # In some dialects, column identifiers are effectively case insensitive
      # because they are automatically converted to uppercase. Most other systems
      # automatically lowercase unquoted identifiers.
      "snowflake",
+     "oracle",
  }
  assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
      DIALECTS_WITH_CASE_INSENSITIVE_COLS
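
For context, adding "oracle" to both sets says that unquoted Oracle column identifiers are case insensitive and are folded to uppercase, like Snowflake's. A standalone illustration of that folding rule, assuming nothing about DataHub's internals:

def fold_unquoted_identifier(name: str, default_uppercase: bool) -> str:
    """Fold an identifier the way the dialect's parser would."""
    if name.startswith('"') and name.endswith('"'):
        return name[1:-1]  # quoted identifiers keep their exact case
    return name.upper() if default_uppercase else name.lower()

# Oracle / Snowflake behavior (default-uppercase dialects):
assert fold_unquoted_identifier("order_id", default_uppercase=True) == "ORDER_ID"
# Most other dialects fold unquoted identifiers to lowercase:
assert fold_unquoted_identifier("Order_Id", default_uppercase=False) == "order_id"
# Quoting always preserves case:
assert fold_unquoted_identifier('"Order_Id"', default_uppercase=True) == "Order_Id"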

datahub/sql_parsing/sqlglot_lineage.py (+1 -1)
@@ -473,7 +473,7 @@ def _create_table_ddl_cll(
      return column_lineage


- def _select_statement_cll(  # noqa: C901
+ def _select_statement_cll(
      statement: _SupportedColumnLineageTypes,
      dialect: sqlglot.Dialect,
      root_scope: sqlglot.optimizer.Scope,

datahub/sql_parsing/sqlglot_utils.py (+1 -4)
@@ -56,10 +56,7 @@ def get_dialect(platform: DialectOrStr) -> sqlglot.Dialect:
  def is_dialect_instance(
      dialect: sqlglot.Dialect, platforms: Union[str, Iterable[str]]
  ) -> bool:
-     if isinstance(platforms, str):
-         platforms = [platforms]
-     else:
-         platforms = list(platforms)
+     platforms = [platforms] if isinstance(platforms, str) else list(platforms)

      dialects = [get_dialect(platform) for platform in platforms]


datahub/testing/check_sql_parser_result.py (+5 -6)
@@ -1,5 +1,4 @@
  import logging
- import os
  import pathlib
  from typing import Any, Dict, Optional

@@ -8,11 +7,10 @@ import deepdiff
  from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
  from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
  from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage
+ from datahub.testing.pytest_hooks import get_golden_settings

  logger = logging.getLogger(__name__)

- UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true"
-

  def assert_sql_result_with_resolver(
      sql: str,
@@ -22,6 +20,8 @@ def assert_sql_result_with_resolver(
      allow_table_error: bool = False,
      **kwargs: Any,
  ) -> None:
+     settings = get_golden_settings()
+
      # HACK: Our BigQuery source overwrites this value and doesn't undo it.
      # As such, we need to handle that here.
      BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd"
@@ -47,15 +47,14 @@ def assert_sql_result_with_resolver(
      )

      txt = res.json(indent=4)
-     if UPDATE_FILES:
+     if settings.update_golden:
          expected_file.write_text(txt)
          return

      if not expected_file.exists():
          expected_file.write_text(txt)
          raise AssertionError(
-             f"Expected file {expected_file} does not exist. "
-             "Created it with the expected output. Please verify it."
+             f"Missing expected golden file; run with --update-golden-files to create it: {expected_file}"
          )

      expected = SqlParsingResult.parse_raw(expected_file.read_text())

datahub/testing/compare_metadata_json.py (+7 -6)
@@ -16,6 +16,7 @@ from deepdiff import DeepDiff
  from datahub.ingestion.sink.file import write_metadata_file
  from datahub.ingestion.source.file import read_metadata_file
  from datahub.testing.mcp_diff import CannotCompareMCPs, MCPDiff, get_aspects_by_urn
+ from datahub.testing.pytest_hooks import get_golden_settings

  logger = logging.getLogger(__name__)

@@ -40,26 +41,26 @@ def load_json_file(filename: Union[str, os.PathLike]) -> MetadataJson:
  def assert_metadata_files_equal(
      output_path: Union[str, os.PathLike],
      golden_path: Union[str, os.PathLike],
-     update_golden: bool,
-     copy_output: bool,
      ignore_paths: Sequence[str] = (),
      ignore_paths_v2: Sequence[str] = (),
      ignore_order: bool = True,
  ) -> None:
+     settings = get_golden_settings()
+
      golden_exists = os.path.isfile(golden_path)

-     if copy_output:
+     if settings.copy_output:
          shutil.copyfile(str(output_path), str(golden_path) + ".output")
          logger.info(f"Copied output file to {golden_path}.output")

-     if not update_golden and not golden_exists:
+     if not settings.update_golden and not golden_exists:
          raise FileNotFoundError(
              "Golden file does not exist. Please run with the --update-golden-files option to create."
          )

      output = load_json_file(output_path)

-     if update_golden and not golden_exists:
+     if settings.update_golden and not golden_exists:
          shutil.copyfile(str(output_path), str(golden_path))
          return
      else:
@@ -87,7 +88,7 @@ def assert_metadata_files_equal(
      ignore_paths = (*ignore_paths, *default_exclude_paths)

      diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
-     if diff and settings.update_golden:
          if isinstance(diff, MCPDiff) and diff.is_delta_valid:
              logger.info(f"Applying delta to golden file {golden_path}")
              diff.apply_delta(golden)
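
Both testing helpers above now read their flags from datahub.testing.pytest_hooks.get_golden_settings() instead of taking update_golden / copy_output parameters or an UPDATE_SQLPARSER_FILES environment variable. The new pytest_hooks module's contents are not shown in this diff; judging only from the call sites, the returned settings object needs to expose two booleans, roughly along these hypothetical lines:

# Hypothetical sketch of the interface implied by the call sites above; the
# real implementation lives in datahub/testing/pytest_hooks.py (new in this
# release, not shown here) and is driven by pytest options such as
# --update-golden-files.
from dataclasses import dataclass

@dataclass
class GoldenFileSettings:
    update_golden: bool  # rewrite golden files instead of asserting against them
    copy_output: bool    # also copy the produced output next to the golden file

def get_golden_settings() -> GoldenFileSettings:
    ...  # in the real module, derived from the active pytest session's options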