acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (107)
  1. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
  2. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
  3. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
  59. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  62. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  63. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  64. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  65. datahub/ingestion/source/sql/mssql/source.py +8 -4
  66. datahub/ingestion/source/sql/oracle.py +51 -4
  67. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  68. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  69. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  70. datahub/ingestion/source/superset.py +291 -35
  71. datahub/ingestion/source/usage/usage_common.py +0 -65
  72. datahub/ingestion/source/vertexai/__init__.py +0 -0
  73. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  74. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  75. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  76. datahub/metadata/_schema_classes.py +472 -1
  77. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  80. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  81. datahub/metadata/schema.avsc +313 -2
  82. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  83. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  84. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  85. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  86. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  87. datahub/metadata/schemas/Deprecation.avsc +2 -0
  88. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  89. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  90. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  91. datahub/metadata/schemas/Siblings.avsc +2 -0
  92. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  93. datahub/sdk/__init__.py +1 -0
  94. datahub/sdk/dataset.py +122 -0
  95. datahub/sdk/entity.py +99 -3
  96. datahub/sdk/entity_client.py +27 -3
  97. datahub/sdk/main_client.py +24 -1
  98. datahub/sdk/search_client.py +81 -8
  99. datahub/sdk/search_filters.py +94 -37
  100. datahub/sql_parsing/split_statements.py +17 -3
  101. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  102. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  103. datahub/testing/mcp_diff.py +1 -18
  104. datahub/utilities/threaded_iterator_executor.py +16 -3
  105. datahub/ingestion/source/vertexai.py +0 -697
  106. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  107. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/sdk/search_client.py

@@ -2,36 +2,106 @@ from __future__ import annotations
 
 from typing import (
     TYPE_CHECKING,
-    Dict,
     Iterable,
     List,
     Optional,
+    Tuple,
+    Type,
+    TypeVar,
 )
 
-from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
 from datahub.metadata.urns import Urn
-from datahub.sdk.search_filters import Filter
+from datahub.sdk.search_filters import (
+    Filter,
+    FilterDsl,
+    _EntityTypeFilter,
+    _OrFilters,
+    _StatusFilter,
+)
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
 
 
+_FilterType = TypeVar("_FilterType", bound=Filter)
+
+
+def _typed_dfs(
+    filter: Optional[_FilterType], type: Type[_FilterType]
+) -> Optional[List[_FilterType]]:
+    if filter is None:
+        return None
+
+    found: Optional[List[_FilterType]] = None
+    for f in filter.dfs():
+        if isinstance(f, type):
+            if found is None:
+                found = []
+            found.append(f)
+    return found
+
+
 def compile_filters(
     filter: Optional[Filter],
-) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+) -> Tuple[Optional[List[str]], RawSearchFilter]:
     # TODO: Not every filter type is supported for every entity type.
     # If we can detect issues with the filters at compile time, we should
     # raise an error.
 
-    if filter is None:
-        return None
+    existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
+    if existing_soft_deleted_filter is None:
+        soft_deleted_filter = FilterDsl.soft_deleted(
+            RemovedStatusFilter.NOT_SOFT_DELETED
+        )
+        if filter is None:
+            filter = soft_deleted_filter
+        else:
+            filter = FilterDsl.and_(filter, soft_deleted_filter)
+
+    # This should be safe - if filter were None coming in, then we would replace it
+    # with the soft-deleted filter.
+    assert filter is not None
 
     initial_filters = filter.compile()
-    return [
+
+    compiled_filters: RawSearchFilter = [
         {"and": [rule.to_raw() for rule in andClause["and"]]}
         for andClause in initial_filters
     ]
 
+    entity_types = compute_entity_types(initial_filters)
+
+    return entity_types, compiled_filters
+
+
+def compute_entity_types(
+    filters: _OrFilters,
+) -> Optional[List[str]]:
+    found_filters = False
+    found_positive_filters = False
+    entity_types: List[str] = []
+    for ands in filters:
+        for clause in ands["and"]:
+            if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
+                found_filters = True
+                if not clause.negated:
+                    found_positive_filters = True
+
+                entity_types.extend(clause.values)
+
+    if not found_filters:
+        # If we didn't find any filters, use None so we use the default set.
+        return None
+
+    if not found_positive_filters:
+        # If we only found negated filters, then it's probably a query like
+        # "find me all entities except for dashboards". In that case, we
+        # still want to use the default set.
+        return None
+
+    return entity_types
+
 
 class SearchClient:
     def __init__(self, client: DataHubClient):

@@ -43,8 +113,11 @@ class SearchClient:
         filter: Optional[Filter] = None,
     ) -> Iterable[Urn]:
         # TODO: Add better limit / pagination support.
+        types, compiled_filters = compile_filters(filter)
         for urn in self._client._graph.get_urns_by_filter(
             query=query,
-            extra_or_filters=compile_filters(filter),
+            status=None,
+            extra_or_filters=compiled_filters,
+            entity_types=types,
         ):
             yield Urn.from_string(urn)
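For orientation, here is a minimal sketch of how the updated compile_filters might be called. The filter composition below is hypothetical; entity type names are converted to their GraphQL enum form, and a NOT_SOFT_DELETED status rule is appended when no status filter is present.

    from datahub.sdk.search_client import compile_filters
    from datahub.sdk.search_filters import FilterDsl as F

    # Hypothetical filter: snowflake datasets only.
    flt = F.and_(F.entity_type("dataset"), F.platform("snowflake"))
    entity_types, raw_filters = compile_filters(flt)
    # entity_types: e.g. ["DATASET"]; raw_filters: a list of {"and": [...]} rule dicts,
    # including the implicit not-soft-deleted rule.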
datahub/sdk/search_filters.py

@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import entity_type_to_graphql
-from datahub.ingestion.graph.filters import SearchFilterRule
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass
 
-
-def _flexible_entity_type_to_graphql(entity_type: str) -> str:
-    if entity_type.upper() == entity_type:
-        # Assume that we were passed a graphql EntityType enum value,
-        # so no conversion is needed.
-        return entity_type
-    return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
 
 
 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )
 
     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field="_entityType",
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )
 
     def compile(self) -> _OrFilters:
@@ -63,25 +74,39 @@ class _EntityTypeFilter(_BaseFilter):
 
 
 class _EntitySubtypeFilter(_BaseFilter):
-    entity_type: str
     entity_subtype: str = pydantic.Field(
         description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
     )
 
+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="typeNames",
+            condition="EQUAL",
+            values=[self.entity_subtype],
+        )
+
     def compile(self) -> _OrFilters:
-        rules = [
-            SearchFilterRule(
-                field="_entityType",
-                condition="EQUAL",
-                values=[_flexible_entity_type_to_graphql(self.entity_type)],
-            ),
-            SearchFilterRule(
-                field="typeNames",
-                condition="EQUAL",
-                values=[self.entity_subtype],
-            ),
-        ]
-        return [{"and": rules}]
+        return [{"and": [self._build_rule()]}]
+
+
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()
 
 
 class _PlatformFilter(_BaseFilter):
@@ -157,10 +182,10 @@ class _EnvFilter(_BaseFilter):
 
 
 class _CustomCondition(_BaseFilter):
-    """Represents a single field condition"""
+    """Represents a single field condition."""
 
     field: str
-    condition: str
+    condition: FilterOperator
     values: List[str]
 
     def compile(self) -> _OrFilters:
@@ -173,7 +198,7 @@ class _CustomCondition(_BaseFilter):
 
 
 class _And(_BaseFilter):
-    """Represents an AND conjunction of filters"""
+    """Represents an AND conjunction of filters."""
 
     and_: Sequence["Filter"] = pydantic.Field(alias="and")
     # TODO: Add validator to ensure that the "and" field is not empty
@@ -219,9 +244,14 @@ class _And(_BaseFilter):
             ]
         }
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+
 
 class _Or(_BaseFilter):
-    """Represents an OR conjunction of filters"""
+    """Represents an OR conjunction of filters."""
 
     or_: Sequence["Filter"] = pydantic.Field(alias="or")
     # TODO: Add validator to ensure that the "or" field is not empty
@@ -232,9 +262,14 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+
 
 class _Not(_BaseFilter):
-    """Represents a NOT filter"""
+    """Represents a NOT filter."""
 
     not_: "Filter" = pydantic.Field(alias="not")
 
@@ -262,6 +297,10 @@ class _Not(_BaseFilter):
 
         return final_filters
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+
 
 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
@@ -271,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
     _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -318,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)
 
+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -329,14 +381,15 @@ class FilterDsl:
         )
 
     @staticmethod
-    def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
+    def entity_subtype(
+        entity_subtype: Union[str, Sequence[str]],
+    ) -> _EntitySubtypeFilter:
         return _EntitySubtypeFilter(
-            entity_type=entity_type,
-            entity_subtype=subtype,
+            entity_subtype=entity_subtype,
        )
 
     @staticmethod
-    def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
+    def platform(platform: Union[str, Sequence[str]], /) -> _PlatformFilter:
         return _PlatformFilter(
             platform=[platform] if isinstance(platform, str) else platform
         )
@@ -344,11 +397,11 @@ class FilterDsl:
 
     # TODO: Add a platform_instance filter
 
     @staticmethod
-    def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
+    def domain(domain: Union[str, Sequence[str]], /) -> _DomainFilter:
         return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)
 
     @staticmethod
-    def env(env: Union[str, List[str]], /) -> _EnvFilter:
+    def env(env: Union[str, Sequence[str]], /) -> _EnvFilter:
         return _EnvFilter(env=[env] if isinstance(env, str) else env)
 
     @staticmethod
@@ -359,13 +412,17 @@ class FilterDsl:
             values=[f"{key}={value}"],
         )
 
+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters
 
     @staticmethod
     def custom_filter(
-        field: str, condition: str, values: List[str]
+        field: str, condition: FilterOperator, values: Sequence[str]
     ) -> _CustomCondition:
         return _CustomCondition(
             field=field,
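A hedged usage sketch of the revised DSL follows: entity_subtype no longer takes an entity_type argument, custom_filter conditions are typed as FilterOperator, and soft-deletion status can now be set explicitly. The filter values below are illustrative.

    from datahub.ingestion.graph.filters import RemovedStatusFilter
    from datahub.sdk.search_filters import FilterDsl as F

    flt = F.and_(
        F.entity_type("dataset"),
        F.entity_subtype("Table"),
        F.soft_deleted(RemovedStatusFilter.ALL),  # include soft-deleted entities
    )
    or_clauses = flt.compile()  # list of {"and": [SearchFilterRule, ...]} clauses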
datahub/sql_parsing/split_statements.py

@@ -1,7 +1,9 @@
+import logging
 import re
 from enum import Enum
 from typing import Iterator, List, Tuple
 
+logger = logging.getLogger(__name__)
 SELECT_KEYWORD = "SELECT"
 CASE_KEYWORD = "CASE"
 END_KEYWORD = "END"
@@ -120,7 +122,9 @@ class _StatementSplitter:
         # Reset current_statement-specific state.
         self.does_select_mean_new_statement = False
         if self.current_case_statements != 0:
-            breakpoint()
+            logger.warning(
+                f"Unexpected END keyword. Current case statements: {self.current_case_statements}"
+            )
         self.current_case_statements = 0
 
     def process(self) -> Iterator[str]:
@@ -233,8 +237,10 @@ class _StatementSplitter:
                ),
            )
            if (
-                is_force_new_statement_keyword and most_recent_real_char != ")"
-            ):  # usually we'd have a close paren that closes a CTE
+                is_force_new_statement_keyword
+                and not self._has_preceding_cte(most_recent_real_char)
+                and not self._is_part_of_merge_query()
+            ):
                # Force termination of current statement
                yield from self._yield_if_complete()
 
@@ -247,6 +253,14 @@ class _StatementSplitter:
            else:
                self.current_statement.append(c)
 
+    def _has_preceding_cte(self, most_recent_real_char: str) -> bool:
+        # usually we'd have a close paren that closes a CTE
+        return most_recent_real_char == ")"
+
+    def _is_part_of_merge_query(self) -> bool:
+        # In merge statement we'd have `when matched then` or `when not matched then`
+        return "".join(self.current_statement).strip().lower().endswith("then")
+
 
 def split_statements(sql: str) -> Iterator[str]:
     """
datahub/sql_parsing/sql_parsing_aggregator.py

@@ -30,6 +30,7 @@ from datahub.metadata.urns import (
     DatasetUrn,
     QueryUrn,
     SchemaFieldUrn,
+    Urn,
 )
 from datahub.sql_parsing.schema_resolver import (
     SchemaResolver,
@@ -139,6 +140,8 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    origin: Optional[Urn] = None
+
     def make_created_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=make_ts_millis(self.latest_timestamp) or 0,
@@ -221,6 +224,7 @@ class PreparsedQuery:
     )
     # Use this to store addtitional key-value information about query for debugging
     extra_info: Optional[dict] = None
+    origin: Optional[Urn] = None
 
 
 @dataclasses.dataclass
@@ -903,6 +907,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                origin=parsed.origin,
             )
         )
 
@@ -1464,6 +1469,7 @@ class SqlParsingAggregator(Closeable):
                 source=models.QuerySourceClass.SYSTEM,
                 created=query.make_created_audit_stamp(),
                 lastModified=query.make_last_modified_audit_stamp(),
+                origin=query.origin.urn() if query.origin else None,
             ),
             models.QuerySubjectsClass(
                 subjects=[
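The new origin field threads a platform URN from the query log entry through to the emitted QueryProperties aspect. A small illustration of the value involved (the platform chosen here is arbitrary):

    from datahub.metadata.urns import DataPlatformUrn

    origin = DataPlatformUrn("hex")  # e.g. what ToolMetaExtractor assigns for Hex-issued queries
    print(origin.urn())              # "urn:li:dataPlatform:hex", the string stored in QueryProperties.origin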
datahub/sql_parsing/tool_meta_extractor.py

@@ -13,7 +13,7 @@ from datahub.api.entities.platformresource.platform_resource import (
 )
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn, DataPlatformUrn, Urn
 from datahub.utilities.search_utils import LogicalOperator
 from datahub.utilities.stats_collections import int_top_k_dict
 
@@ -21,6 +21,10 @@ UrnStr = str
 
 logger = logging.getLogger(__name__)
 
+MODE_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:mode")
+LOOKER_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:looker")
+HEX_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:hex")
+
 
 class QueryLog(Protocol):
     """Represents Query Log Entry
@@ -30,6 +34,7 @@ class QueryLog(Protocol):
     query_text: str
     user: Optional[Union[CorpUserUrn, CorpGroupUrn]]
     extra_info: Optional[dict]
+    origin: Optional[Urn]
 
 
 def _get_last_line(query: str) -> str:
@@ -67,6 +72,10 @@ class ToolMetaExtractor:
                 "looker",
                 self._extract_looker_query,
             ),
+            (
+                "hex",
+                self._extract_hex_query,
+            ),
         ]
         # maps user id (as string) to email address
         self.looker_user_mapping = looker_user_mapping
@@ -153,7 +162,7 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user
 
-        # TODO: Generate an "origin" urn.
+        entry.origin = MODE_PLATFORM_URN
 
         return True
 
@@ -190,6 +199,22 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user
 
+        entry.origin = LOOKER_PLATFORM_URN
+
+        return True
+
+    def _extract_hex_query(self, entry: QueryLog) -> bool:
+        """
+        Returns:
+            bool: whether QueryLog entry is that of hex.
+        """
+        last_line = _get_last_line(entry.query_text)
+
+        if not last_line.startswith("-- Hex query metadata:"):
+            return False
+
+        entry.origin = HEX_PLATFORM_URN
+
         return True
 
     def extract_bi_metadata(self, entry: QueryLog) -> bool:
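Hex queries are recognized purely from a trailing comment on the query text; a minimal check against the helper above (the JSON payload shape is hypothetical):

    from datahub.sql_parsing.tool_meta_extractor import _get_last_line

    sql = 'SELECT * FROM analytics.orders\n-- Hex query metadata: {"project_id": "abc123"}'
    assert _get_last_line(sql).startswith("-- Hex query metadata:")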
datahub/testing/mcp_diff.py

@@ -8,7 +8,6 @@ import deepdiff.serialization
 import yaml
 from deepdiff import DeepDiff
 from deepdiff.model import DiffLevel
-from deepdiff.operator import BaseOperator
 from typing_extensions import Literal
 
 ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:
 
 @dataclasses.dataclass
 class DeltaInfo:
-    """Information about an MCP used to construct a diff delta.
-
-    In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
-    """
+    """Information about an MCP used to construct a diff delta."""
 
     idx: int  # Location in list of MCEs in golden file
     original: Dict[str, Any]  # Original json-serialized MCP
 
 
-class DeltaInfoOperator(BaseOperator):
-    """Warning: Doesn't seem to be working right now.
-    Ignored via an ignore path as an extra layer of defense.
-    """
-
-    def __init__(self):
-        super().__init__(types=[DeltaInfo])
-
-    def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
-        return True
-
-
 AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]
 
 
@@ -176,7 +160,6 @@ class MCPDiff:
             t2=t2,
             exclude_regex_paths=ignore_paths,
             ignore_order=True,
-            custom_operators=[DeltaInfoOperator()],
         )
         if diff:
             aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
datahub/utilities/threaded_iterator_executor.py

@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 T = TypeVar("T")
 
@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-    ) -> Generator[T, None, None]:
-        out_q: queue.Queue[T] = queue.Queue()
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)
 
         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any
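A hedged usage sketch of the new backpressure knob (the worker below is a stand-in): when max_backpressure is omitted it defaults to 10 * max_workers, and the bounded queue keeps fast workers from buffering unbounded results ahead of a slow consumer.

    from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

    def fetch(prefix: str):
        # stand-in worker; real workers typically page through an API
        for i in range(3):
            yield f"{prefix}-{i}"

    results = list(
        ThreadedIteratorExecutor.process(
            fetch,
            [("a",), ("b",)],
            max_workers=2,
            max_backpressure=20,  # bounds the internal queue; defaults to 10 * max_workers
        )
    )
    print(results)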