acryl-datahub 1.0.0.1rc6__py3-none-any.whl → 1.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (82)
  1. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +81 -79
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/datajob/dataflow.py +15 -0
  5. datahub/api/entities/datajob/datajob.py +17 -0
  6. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  7. datahub/api/entities/dataset/dataset.py +2 -2
  8. datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
  9. datahub/cli/ingest_cli.py +4 -4
  10. datahub/cli/migrate.py +6 -6
  11. datahub/configuration/common.py +1 -1
  12. datahub/emitter/mcp_builder.py +4 -0
  13. datahub/errors.py +4 -0
  14. datahub/ingestion/api/common.py +9 -0
  15. datahub/ingestion/api/source.py +6 -2
  16. datahub/ingestion/api/source_helpers.py +35 -2
  17. datahub/ingestion/graph/client.py +122 -7
  18. datahub/ingestion/graph/filters.py +41 -16
  19. datahub/ingestion/run/pipeline.py +0 -6
  20. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  21. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  22. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  23. datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
  24. datahub/ingestion/source/fivetran/fivetran.py +1 -0
  25. datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
  26. datahub/ingestion/source/hex/constants.py +5 -0
  27. datahub/ingestion/source/hex/hex.py +150 -22
  28. datahub/ingestion/source/hex/mapper.py +28 -2
  29. datahub/ingestion/source/hex/model.py +10 -2
  30. datahub/ingestion/source/hex/query_fetcher.py +300 -0
  31. datahub/ingestion/source/iceberg/iceberg.py +106 -18
  32. datahub/ingestion/source/kafka/kafka.py +1 -4
  33. datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
  34. datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
  35. datahub/ingestion/source/looker/looker_source.py +2 -3
  36. datahub/ingestion/source/mlflow.py +6 -7
  37. datahub/ingestion/source/mode.py +2 -2
  38. datahub/ingestion/source/nifi.py +3 -3
  39. datahub/ingestion/source/openapi.py +3 -3
  40. datahub/ingestion/source/openapi_parser.py +8 -8
  41. datahub/ingestion/source/powerbi/config.py +1 -1
  42. datahub/ingestion/source/powerbi/powerbi.py +16 -3
  43. datahub/ingestion/source/redshift/profile.py +2 -2
  44. datahub/ingestion/source/sigma/sigma.py +6 -2
  45. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
  46. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  47. datahub/ingestion/source/sql/trino.py +4 -3
  48. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  49. datahub/ingestion/source/superset.py +108 -81
  50. datahub/ingestion/source/tableau/tableau.py +4 -4
  51. datahub/ingestion/source/tableau/tableau_common.py +2 -2
  52. datahub/ingestion/source/unity/source.py +1 -1
  53. datahub/ingestion/source/vertexai/vertexai.py +7 -7
  54. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  55. datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
  56. datahub/ingestion/transformer/dataset_domain.py +1 -1
  57. datahub/lite/lite_util.py +2 -2
  58. datahub/metadata/_schema_classes.py +47 -2
  59. datahub/metadata/_urns/urn_defs.py +56 -0
  60. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  61. datahub/metadata/schema.avsc +121 -85
  62. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  63. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  64. datahub/metadata/schemas/FormInfo.avsc +5 -0
  65. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  66. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  67. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  68. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  69. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  70. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  71. datahub/sdk/search_client.py +81 -8
  72. datahub/sdk/search_filters.py +73 -11
  73. datahub/testing/mcp_diff.py +1 -1
  74. datahub/utilities/file_backed_collections.py +6 -6
  75. datahub/utilities/hive_schema_to_avro.py +2 -2
  76. datahub/utilities/ingest_utils.py +2 -2
  77. datahub/utilities/threaded_iterator_executor.py +16 -3
  78. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  79. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.0.0.1rc6.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0

datahub/sdk/search_filters.py
@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic
 
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import entity_type_to_graphql
-from datahub.ingestion.graph.filters import FilterOperator, SearchFilterRule
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn
 
@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass
 
-
-def _flexible_entity_type_to_graphql(entity_type: str) -> str:
-    if entity_type.upper() == entity_type:
-        # Assume that we were passed a graphql EntityType enum value,
-        # so no conversion is needed.
-        return entity_type
-    return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
 
 
 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )
 
     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field="_entityType",
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )
 
     def compile(self) -> _OrFilters:
@@ -78,6 +89,26 @@ class _EntitySubtypeFilter(_BaseFilter):
         return [{"and": [self._build_rule()]}]
 
 
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()
+
+
 class _PlatformFilter(_BaseFilter):
     platform: List[str]
     # TODO: Add validator to convert string -> list of strings
@@ -213,6 +244,11 @@ class _And(_BaseFilter):
             ]
         }
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+
 
 class _Or(_BaseFilter):
     """Represents an OR conjunction of filters."""
@@ -226,6 +262,11 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+
 
 class _Not(_BaseFilter):
     """Represents a NOT filter."""
@@ -256,6 +297,10 @@ class _Not(_BaseFilter):
 
         return final_filters
 
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+
 
 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
@@ -265,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
    _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -312,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)
 
+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -354,6 +412,10 @@ class FilterDsl:
             values=[f"{key}={value}"],
        )
 
+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters
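
The search_filters.py changes add a _StatusFilter, a dfs() traversal helper on every filter node, and the FilterDsl.true() / FilterDsl.false() / FilterDsl.soft_deleted() constructors. A minimal usage sketch follows; it assumes FilterDsl.and_ exists alongside the not_ and entity_type constructors shown above, and that RemovedStatusFilter has a NOT_SOFT_DELETED member, as the _StatusFilter docstring implies.

from datahub.ingestion.graph.filters import RemovedStatusFilter
from datahub.sdk.search_filters import FilterDsl as F

# Combine an entity-type filter with the new soft-deletion status filter.
# FilterDsl.and_ is assumed to exist alongside the not_/entity_type helpers shown above.
flt = F.and_(
    F.entity_type("dataset"),
    F.soft_deleted(RemovedStatusFilter.NOT_SOFT_DELETED),
)

# The new dfs() helper walks the composed filter tree, yielding every nested node,
# e.g. to check whether a status filter is already present before adding a default one.
has_status_filter = any(type(node).__name__ == "_StatusFilter" for node in flt.dfs())

# compile() still produces the OR-of-ANDs structure sent to the search endpoint;
# FilterDsl.true() and FilterDsl.false() act as identity elements for that algebra.
print(has_status_filter, flt.compile())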

datahub/testing/mcp_diff.py
@@ -189,7 +189,7 @@ class MCPDiff:
         """
         aspect_diffs = [v for d in self.aspect_changes.values() for v in d.values()]
         for aspect_diff in aspect_diffs:
-            for _, old, new in aspect_diff.aspects_changed.keys():
+            for _, old, new in aspect_diff.aspects_changed:
                 golden[old.delta_info.idx] = new.delta_info.original
 
         indices_to_remove = set()

datahub/utilities/file_backed_collections.py
@@ -250,7 +250,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 rowid INTEGER PRIMARY KEY AUTOINCREMENT,
                 key TEXT UNIQUE,
                 value BLOB
-                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
+                {"".join(f", {column_name} BLOB" for column_name in self.extra_columns)}
             )"""
         )
 
@@ -267,7 +267,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
         if self.indexes_created:
             return
         # The key column will automatically be indexed, but we need indexes for the extra columns.
-        for column_name in self.extra_columns.keys():
+        for column_name in self.extra_columns:
            self._conn.execute(
                 f"CREATE INDEX {self.tablename}_{column_name} ON {self.tablename} ({column_name})"
            )
@@ -305,12 +305,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name}" for column_name in self.extra_columns)}
                 )
                 VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
                 ON CONFLICT (key) DO UPDATE SET
                     value = excluded.value
-                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns)}
                 """,
                 items_to_write,
             )
@@ -321,7 +321,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
                 f"""INSERT INTO {self.tablename} (
                     key,
                     value
-                    {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name}" for column_name in self.extra_columns)}
                 )
                 VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
                 item,
@@ -330,7 +330,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
             self._conn.execute(
                 f"""UPDATE {self.tablename} SET
                     value = ?
-                    {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
+                    {"".join(f", {column_name} = ?" for column_name in self.extra_columns)}
                 WHERE key = ?""",
                 (*item[1:], item[0]),
             )

datahub/utilities/hive_schema_to_avro.py
@@ -155,7 +155,7 @@ class HiveColumnToAvroConverter:
 
     @staticmethod
     def _parse_basic_datatype_string(s: str) -> Dict[str, object]:
-        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE.keys():
+        if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE:
             return {
                 "type": HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE[s],
                 "native_data_type": s,
@@ -218,7 +218,7 @@ class HiveColumnToAvroConverter:
         buf = ""
         level = 0
         for c in s:
-            if c in HiveColumnToAvroConverter._BRACKETS.keys():
+            if c in HiveColumnToAvroConverter._BRACKETS:
                 level += 1
                 buf += c
             elif c in HiveColumnToAvroConverter._BRACKETS.values():
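
The .keys() removals in the mcp_diff.py, file_backed_collections.py, and hive_schema_to_avro.py hunks above are behavior-preserving cleanups: membership tests and iteration over a dict already operate on its keys, so the explicit .keys() calls were redundant. A small self-contained illustration:

d = {"a": 1, "b": 2}

# Membership, iteration, and comprehensions over a dict all use its keys,
# so "in d" and "for k in d" are equivalent to the .keys() forms they replaced.
assert ("a" in d) == ("a" in d.keys())
assert list(d) == list(d.keys())
assert [k for k in d] == [k for k in d.keys()]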

datahub/utilities/ingest_utils.py
@@ -32,10 +32,10 @@ def deploy_source_vars(
     name: Optional[str],
     config: str,
     urn: Optional[str],
-    executor_id: str,
+    executor_id: Optional[str],
     cli_version: Optional[str],
     schedule: Optional[str],
-    time_zone: str,
+    time_zone: Optional[str],
     extra_pip: Optional[str],
     debug: bool = False,
 ) -> dict:

datahub/utilities/threaded_iterator_executor.py
@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)
 
 T = TypeVar("T")
 
@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-    ) -> Generator[T, None, None]:
-        out_q: queue.Queue[T] = queue.Queue()
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)
 
         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any
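
The threaded_iterator_executor.py change above bounds the executor's internal output queue so fast workers cannot run arbitrarily far ahead of a slow consumer. A usage sketch; the worker function, argument values, and the keyword-style call are illustrative:

from typing import Iterator

from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor


def fetch_batch(batch_id: int) -> Iterator[str]:
    # Illustrative worker: each invocation yields a few results for its batch.
    for i in range(3):
        yield f"batch-{batch_id}-item-{i}"


# process() fans the argument tuples out across max_workers threads and yields
# results as they complete; max_backpressure caps how many undelivered results
# may sit in the internal queue (per the diff, it defaults to 10 * max_workers).
for result in ThreadedIteratorExecutor.process(
    fetch_batch,
    [(i,) for i in range(10)],
    max_workers=4,
    max_backpressure=8,
):
    print(result)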

datahub/ingestion/transformer/system_metadata_transformer.py (deleted)
@@ -1,45 +0,0 @@
-import functools
-from typing import Iterable
-
-from datahub.emitter.mce_builder import get_sys_time
-from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
-from datahub.ingestion.api.transform import Transformer
-from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.transformer.auto_helper_transformer import AutoHelperTransformer
-from datahub.metadata.schema_classes import SystemMetadataClass
-
-
-def auto_system_metadata(
-    ctx: PipelineContext,
-    stream: Iterable[MetadataWorkUnit],
-) -> Iterable[MetadataWorkUnit]:
-    if not ctx.pipeline_config:
-        raise ValueError("Pipeline config is required for system metadata")
-    set_system_metadata = ctx.pipeline_config.flags.set_system_metadata
-    set_pipeline_name = ctx.pipeline_config.flags.set_system_metadata_pipeline_name
-
-    for workunit in stream:
-        if set_system_metadata:
-            workunit.metadata.systemMetadata = SystemMetadataClass(
-                lastObserved=get_sys_time(), runId=ctx.run_id
-            )
-            if set_pipeline_name:
-                workunit.metadata.systemMetadata.pipelineName = ctx.pipeline_name
-
-        yield workunit
-
-
-class SystemMetadataTransformer(Transformer):
-    def __init__(self, ctx: PipelineContext):
-        self._inner_transformer = AutoHelperTransformer(
-            functools.partial(auto_system_metadata, ctx)
-        )
-
-    def transform(
-        self, record_envelopes: Iterable[RecordEnvelope]
-    ) -> Iterable[RecordEnvelope]:
-        yield from self._inner_transformer.transform(record_envelopes)
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Transformer:
-        raise NotImplementedError(f"{cls.__name__} cannot be created from config")