acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,10 +77,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -252,7 +248,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +466,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )
 
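The MLflow hunks above (and the Vertex AI hunks further down) drop the source-local ContainerKeyWithId class in favor of a shared ExperimentKey imported from datahub.emitter.mcp_builder. A minimal sketch of what that shared key presumably looks like, inferred from the class it replaces; the actual definition ships in mcp_builder:

```python
from datahub.emitter.mcp_builder import ContainerKey


class ExperimentKey(ContainerKey):
    # Sketch only: mirrors the removed source-local ContainerKeyWithId.
    # The real class lives in datahub.emitter.mcp_builder.
    id: str
```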
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )
 
         # Now add MCPs in sequence
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
         if catalog_name is None:
             raise exc.NoSuchTableError("catalog is required in connection")
         connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-        if connector_name is None:
-            return {}
-        if connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS:
+        if (
+            connector_name is not None
+            and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+        ):
             properties_table = self._get_full_table(f"{table_name}$properties", schema)
             query = f"SELECT * FROM {properties_table}"
             row = connection.execute(sql.text(query)).fetchone()
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
 
 
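Removing hidden_from_docs surfaces fail_safe_threshold in the generated config docs. A sketch of setting it on a source's stateful ingestion config in dict form; the surrounding keys are standard stateful-ingestion settings and are an assumption here, not taken from this diff:

```python
source_config = {
    "stateful_ingestion": {
        "enabled": True,
        # Abort stale-entity removal if more than 40% of entities would be
        # soft-deleted relative to the previous checkpoint.
        "fail_safe_threshold": 40.0,
    },
}
```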
@@ -22,7 +22,11 @@ from google.oauth2 import service_account
 
 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import ContainerKey, ProjectIdKey, gen_containers
+from datahub.emitter.mcp_builder import (
+    ExperimentKey,
+    ProjectIdKey,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -96,10 +100,6 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -173,7 +173,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         yield from gen_containers(
             parent_container_key=self._get_project_container(),
-            container_key=ContainerKeyWithId(
+            container_key=ExperimentKey(
                 platform=self.platform,
                 id=self._make_vertexai_experiment_name(experiment.name),
             ),
@@ -309,7 +309,7 @@ class VertexAISource(Source):
     def _gen_experiment_run_mcps(
         self, experiment: Experiment, run: ExperimentRun
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        experiment_key = ContainerKeyWithId(
+        experiment_key = ExperimentKey(
             platform=self.platform,
             id=self._make_vertexai_experiment_name(experiment.name),
         )
@@ -2,36 +2,106 @@ from __future__ import annotations
 
 from typing import (
     TYPE_CHECKING,
-    Dict,
     Iterable,
     List,
     Optional,
+    Tuple,
+    Type,
+    TypeVar,
 )
 
-from datahub.ingestion.graph.filters import RawSearchFilterRule
+from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
 from datahub.metadata.urns import Urn
-from datahub.sdk.search_filters import Filter
+from datahub.sdk.search_filters import (
+    Filter,
+    FilterDsl,
+    _EntityTypeFilter,
+    _OrFilters,
+    _StatusFilter,
+)
 
 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient
 
 
+_FilterType = TypeVar("_FilterType", bound=Filter)
+
+
+def _typed_dfs(
+    filter: Optional[_FilterType], type: Type[_FilterType]
+) -> Optional[List[_FilterType]]:
+    if filter is None:
+        return None
+
+    found: Optional[List[_FilterType]] = None
+    for f in filter.dfs():
+        if isinstance(f, type):
+            if found is None:
+                found = []
+            found.append(f)
+    return found
+
+
 def compile_filters(
     filter: Optional[Filter],
-) -> Optional[List[Dict[str, List[RawSearchFilterRule]]]]:
+) -> Tuple[Optional[List[str]], RawSearchFilter]:
     # TODO: Not every filter type is supported for every entity type.
     # If we can detect issues with the filters at compile time, we should
     # raise an error.
 
-    if filter is None:
-        return None
+    existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
+    if existing_soft_deleted_filter is None:
+        soft_deleted_filter = FilterDsl.soft_deleted(
+            RemovedStatusFilter.NOT_SOFT_DELETED
+        )
+        if filter is None:
+            filter = soft_deleted_filter
+        else:
+            filter = FilterDsl.and_(filter, soft_deleted_filter)
+
+    # This should be safe - if filter were None coming in, then we would replace it
+    # with the soft-deleted filter.
+    assert filter is not None
 
     initial_filters = filter.compile()
-    return [
+
+    compiled_filters: RawSearchFilter = [
         {"and": [rule.to_raw() for rule in andClause["and"]]}
         for andClause in initial_filters
     ]
 
+    entity_types = compute_entity_types(initial_filters)
+
+    return entity_types, compiled_filters
+
+
+def compute_entity_types(
+    filters: _OrFilters,
+) -> Optional[List[str]]:
+    found_filters = False
+    found_positive_filters = False
+    entity_types: List[str] = []
+    for ands in filters:
+        for clause in ands["and"]:
+            if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
+                found_filters = True
+                if not clause.negated:
+                    found_positive_filters = True
+
+                entity_types.extend(clause.values)
+
+    if not found_filters:
+        # If we didn't find any filters, use None so we use the default set.
+        return None
+
+    if not found_positive_filters:
+        # If we only found negated filters, then it's probably a query like
+        # "find me all entities except for dashboards". In that case, we
+        # still want to use the default set.
+        return None
+
+    return entity_types
+
 
 class SearchClient:
     def __init__(self, client: DataHubClient):
@@ -43,8 +113,11 @@ class SearchClient:
         filter: Optional[Filter] = None,
     ) -> Iterable[Urn]:
         # TODO: Add better limit / pagination support.
+        types, compiled_filters = compile_filters(filter)
         for urn in self._client._graph.get_urns_by_filter(
             query=query,
-            extra_or_filters=compile_filters(filter),
+            status=None,
+            extra_or_filters=compiled_filters,
+            entity_types=types,
         ):
             yield Urn.from_string(urn)
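With this change, soft-delete handling moves out of get_urns_by_filter (which is now passed status=None) and into compile_filters, which appends a NOT_SOFT_DELETED clause unless the caller supplied a _StatusFilter of their own. A rough sketch of the new calling convention, with the expected shapes noted as comments rather than verified output:

```python
from datahub.sdk.search_filters import FilterDsl

# compile_filters (defined above) now returns an (entity_types, raw_filters) pair.
entity_types, raw_filters = compile_filters(FilterDsl.entity_type("dataset"))

# entity_types is an optional hint for the search backend, e.g. ["DATASET"],
# or None when no positive entity-type clause is present.
# raw_filters is a list of {"and": [...]} clauses; because no explicit status
# filter was given, each AND clause also carries the NOT_SOFT_DELETED rule.
```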
@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import entity_type_to_graphql
-from datahub.ingestion.graph.filters import FilterOperator, SearchFilterRule
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass
 
-
-def _flexible_entity_type_to_graphql(entity_type: str) -> str:
-    if entity_type.upper() == entity_type:
-        # Assume that we were passed a graphql EntityType enum value,
-        # so no conversion is needed.
-        return entity_type
-    return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
 
 
 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )
 
     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field="_entityType",
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )
 
     def compile(self) -> _OrFilters:
@@ -78,6 +89,26 @@ class _EntitySubtypeFilter(_BaseFilter):
         return [{"and": [self._build_rule()]}]
 
 
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()
+
+
 class _PlatformFilter(_BaseFilter):
     platform: List[str]
     # TODO: Add validator to convert string -> list of strings
@@ -213,6 +244,11 @@ class _And(_BaseFilter):
             ]
         }
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+
 
 class _Or(_BaseFilter):
     """Represents an OR conjunction of filters."""
@@ -226,6 +262,11 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+
 
 class _Not(_BaseFilter):
     """Represents a NOT filter."""
@@ -256,6 +297,10 @@ class _Not(_BaseFilter):
 
         return final_filters
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+
 
 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
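The dfs methods added to _And, _Or, and _Not let compile_filters walk an arbitrary filter tree; that is how it detects an existing _StatusFilter. A small sketch of the traversal; FilterDsl.or_ and FilterDsl.platform are assumed to exist as constructors for the _Or and _PlatformFilter classes shown in this file:

```python
from datahub.sdk.search_filters import FilterDsl

flt = FilterDsl.or_(
    FilterDsl.entity_type("dataset"),
    FilterDsl.platform("snowflake"),
)

# Pre-order traversal: the _Or node first, then each leaf filter it wraps.
for node in flt.dfs():
    print(type(node).__name__)
```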
@@ -265,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
     _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -312,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)
 
+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -354,6 +412,10 @@ class FilterDsl:
             values=[f"{key}={value}"],
         )
 
+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters
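FilterDsl.soft_deleted gives callers explicit control over the status handling that compile_filters otherwise applies. A short sketch of opting back in to soft-deleted entities; this assumes RemovedStatusFilter exposes an ALL member, as it does in the graph client:

```python
from datahub.ingestion.graph.filters import RemovedStatusFilter
from datahub.sdk.search_filters import FilterDsl

# Search datasets including soft-deleted ones; without the explicit status
# filter, compile_filters would add NOT_SOFT_DELETED on our behalf.
flt = FilterDsl.and_(
    FilterDsl.entity_type("dataset"),
    FilterDsl.soft_deleted(RemovedStatusFilter.ALL),
)
```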
@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:
@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 T = TypeVar("T")
 
@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-    ) -> Generator[T, None, None]:
-        out_q: queue.Queue[T] = queue.Queue()
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)
 
         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any