acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_common.py

@@ -357,6 +357,11 @@ class DBTCommonConfig(
         default=True,
         description="When enabled, includes the compiled code in the emitted metadata.",
     )
+    include_database_name: bool = Field(
+        default=True,
+        description="Whether to add database name to the table urn. "
+        "Set to False to skip it for engines like AWS Athena where it's not required.",
+    )

     @validator("target_platform")
     def validate_target_platform_value(cls, target_platform: str) -> str:
@@ -1028,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map.keys():
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)

@@ -1769,10 +1774,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         logger.debug(
             f"Owner after applying owner extraction pattern:'{self.config.owner_extraction_pattern}' is '{owner}'."
         )
-        if isinstance(owner, list):
-            owners = owner
-        else:
-            owners = [owner]
+        owners = owner if isinstance(owner, list) else [owner]
+
         for owner in owners:
             if self.config.strip_user_ids_from_email:
                 owner = owner.split("@")[0]
datahub/ingestion/source/dbt/dbt_core.py

@@ -167,6 +167,7 @@ def extract_dbt_entities(
     use_identifiers: bool,
     tag_prefix: str,
     only_include_if_in_catalog: bool,
+    include_database_name: bool,
     report: DBTSourceReport,
 ) -> List[DBTNode]:
     sources_by_id = {x["unique_id"]: x for x in sources_results}
@@ -267,7 +268,7 @@ def extract_dbt_entities(
             dbt_name=key,
             dbt_adapter=manifest_adapter,
             dbt_package_name=manifest_node.get("package_name"),
-            database=manifest_node["database"],
+            database=manifest_node["database"] if include_database_name else None,
             schema=manifest_node["schema"],
             name=name,
             alias=manifest_node.get("alias"),
@@ -543,14 +544,15 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
-            all_manifest_entities,
-            all_catalog_entities,
-            sources_results,
-            manifest_adapter,
-            self.config.use_identifiers,
-            self.config.tag_prefix,
-            self.config.only_include_if_in_catalog,
-            self.report,
+            all_manifest_entities=all_manifest_entities,
+            all_catalog_entities=all_catalog_entities,
+            sources_results=sources_results,
+            manifest_adapter=manifest_adapter,
+            use_identifiers=self.config.use_identifiers,
+            tag_prefix=self.config.tag_prefix,
+            only_include_if_in_catalog=self.config.only_include_if_in_catalog,
+            include_database_name=self.config.include_database_name,
+            report=self.report,
         )

         return (
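
Note: the new include_database_name flag defined on DBTCommonConfig above is threaded through extract_dbt_entities here. A minimal usage sketch follows, assuming the flag is set through the standard programmatic recipe API; the file paths and server URL are placeholders, not values taken from this release.

# Sketch: drop database names from dbt dataset urns, e.g. for AWS Athena.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                "target_platform": "athena",
                "include_database_name": False,  # new in this release
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()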
datahub/ingestion/source/dbt/dbt_tests.py

@@ -57,15 +57,11 @@ def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
         # base assertions are violated, bail early
         return None
     m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref)
-    if m:
-        destination_table = m.group(1)
-    else:
-        destination_table = destination_ref
+    destination_table = m.group(1) if m else destination_ref
+
     m = re.search(r"ref\(\'(.*)\'\)", source_ref)
-    if m:
-        source_table = m.group(1)
-    else:
-        source_table = source_ref
+    source_table = m.group(1) if m else source_ref
+
     return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}"


datahub/ingestion/source/delta_lake/config.py

@@ -13,6 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)

 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -35,7 +38,11 @@ class S3(ConfigModel):
     )


-class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class DeltaLakeSourceConfig(
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+    StatefulIngestionConfigBase,
+):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
datahub/ingestion/source/delta_lake/report.py

@@ -1,12 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field

-from datahub.ingestion.api.source import SourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
 from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
-class DeltaLakeSourceReport(SourceReport):
+class DeltaLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

datahub/ingestion/source/delta_lake/source.py

@@ -2,7 +2,7 @@ import json
 import logging
 import os
 import time
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 from urllib.parse import urlparse

 from deltalake import DeltaTable
@@ -21,7 +21,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
 from datahub.ingestion.source.aws.s3_util import (
@@ -36,6 +36,12 @@ from datahub.ingestion.source.delta_lake.delta_lake_utils import (
     read_delta_table,
 )
 from datahub.ingestion.source.delta_lake.report import DeltaLakeSourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -79,7 +85,7 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-class DeltaLakeSource(Source):
+class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
     - Column types and schema associated with each delta table
@@ -100,9 +106,10 @@ class DeltaLakeSource(Source):
     storage_options: Dict[str, str]

     def __init__(self, config: DeltaLakeSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.source_config = config
-        self.report = DeltaLakeSourceReport()
+        self.report: DeltaLakeSourceReport = DeltaLakeSourceReport()
         if self.source_config.is_s3:
             if (
                 self.source_config.s3 is None
@@ -331,6 +338,14 @@ class DeltaLakeSource(Source):
         for folder in os.listdir(path):
             yield os.path.join(path, folder)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
             self.source_config.platform,
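
Note: with DeltaLakeSourceConfig now inheriting StatefulIngestionConfigBase and the source registering StaleEntityRemovalHandler, stale-entity removal can be switched on from the recipe. A source-side sketch with placeholder values; the stateful_ingestion keys mirror those used by other stateful DataHub sources, and a named pipeline plus a state provider are still required for the state to take effect.

# Sketch of a delta-lake source config that opts into stale-entity removal.
source_config = {
    "type": "delta-lake",
    "config": {
        "base_path": "s3://my-bucket/delta-tables/",  # placeholder
        "stateful_ingestion": {
            "enabled": True,
            "remove_stale_metadata": True,
        },
    },
}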
datahub/ingestion/source/dremio/dremio_api.py

@@ -271,12 +271,12 @@ class DremioAPIOperations:
                 self.cancel_query(job_id)
                 raise DremioAPIException(
                     f"Query execution timed out after {timeout} seconds"
-                )
+                ) from None
             except RuntimeError as e:
-                raise DremioAPIException(f"{str(e)}")
+                raise DremioAPIException() from e

         except requests.RequestException as e:
-            raise DremioAPIException(f"Error executing query: {str(e)}")
+            raise DremioAPIException("Error executing query") from e

     def fetch_results(self, job_id: str) -> List[Dict]:
         """Fetch job results with status checking"""
@@ -683,11 +683,7 @@ class DremioAPIOperations:
         # Add end anchor for exact matching
         regex_pattern = regex_pattern + "$"

-        for path in paths:
-            if re.match(regex_pattern, path, re.IGNORECASE):
-                return True
-
-        return False
+        return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

     def should_include_container(self, path: List[str], name: str) -> bool:
         """
datahub/ingestion/source/dremio/dremio_aspects.py

@@ -116,10 +116,7 @@ class SchemaFieldTypeMapper:
         data_type = data_type.lower()
         type_class = cls.FIELD_TYPE_MAPPING.get(data_type, NullTypeClass)

-        if data_size:
-            native_data_type = f"{data_type}({data_size})"
-        else:
-            native_data_type = data_type
+        native_data_type = f"{data_type}({data_size})" if data_size else data_type

         try:
             schema_field_type = SchemaFieldDataTypeClass(type=type_class())
@@ -168,8 +165,9 @@ class DremioAspects:
         )

     def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] = []
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
     ) -> str:
+        path = path or []
         container_key = self.get_container_key(name, path)
         return container_key.as_urn()

datahub/ingestion/source/dynamodb/dynamodb.py

@@ -165,6 +165,10 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
     SourceCapability.PLATFORM_INSTANCE,
     "By default, platform_instance will use the AWS account id",
 )
+@capability(
+    SourceCapability.CLASSIFICATION,
+    "Optionally enabled via `classification.enabled`",
+)
 class DynamoDBSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -242,8 +246,10 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             platform=self.platform,
             platform_instance=platform_instance,
             name=dataset_name,
+            env=self.config.env,
         )
         dataset_properties = DatasetPropertiesClass(
+            name=table_name,
             tags=[],
             customProperties={
                 "table.arn": table_info["TableArn"],
datahub/ingestion/source/elastic_search.py

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:


 @dataclass
-class ElasticsearchSourceReport(SourceReport):
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)

@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )


-class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(Source):
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """

     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
datahub/ingestion/source/feast.py

@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }


-class FeastRepositorySourceConfig(ConfigModel):
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(Source):
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):

     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report: SourceReport
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.report = SourceReport()
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):

         if ml_feature_data_type is None:
             self.report.report_warning(
-                parent_name, f"unable to map type {field_type} to metadata schema"
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )

             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
datahub/ingestion/source/file.py

@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
         except Exception as e:
-            self.report.report_failure(f"path-{i}", str(e))
+            self.report.report_failure(f"{file_status.path}-{i}", str(e))

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-    else:
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"failed to parse: {obj}")
+        raise ValueError(f"Failed to parse: {obj}")

     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py

@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows.keys():
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/gc/execution_request_cleanup.py

@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
         )

     def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] = {}
+        self, overrides: Optional[Dict[str, Any]] = None
     ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
         headers: Dict[str, Any] = {
             "Accept": "application/json",
             "Content-Type": "application/json",
datahub/ingestion/source/ge_data_profiler.py

@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == BIGQUERY:
-        element_values = self.engine.execute(
-            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
-                self._table
-            )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if not self.config._allow_deny_patterns.allowed(
-                f"{self.dataset_name}.{col}"
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -605,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
+            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )

-        if platform == BIGQUERY or platform == DATABRICKS:
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")
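
Note on the merged column-filter condition in the _SingleDatasetProfiler hunk above: since and binds tighter than or, A or B and C parses as A or (B and C), so the refactor preserves the original two-branch behavior (pattern-denied columns, or nested columns when nested profiling is off). A plain-Python check of that equivalence:

def ignored(denied: bool, profile_nested: bool, is_nested: bool) -> bool:
    return denied or not profile_nested and is_nested

assert ignored(True, True, False) is True    # denied by pattern
assert ignored(False, False, True) is True   # nested, nested profiling off
assert ignored(False, True, True) is False   # nested, but profiling enabled
assert ignored(False, False, False) is False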
datahub/ingestion/source/iceberg/iceberg.py

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
-            LOGGER.debug(f"Processing dataset for path {dataset_path}")
-            dataset_name = ".".join(dataset_path)
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
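
Note: the created-at handling added above parses the table property with dateutil and converts it to epoch milliseconds before wrapping it in TimeStampClass. A standalone illustration with an arbitrary sample value:

from dateutil import parser as dateutil_parser

created_at = "2024-05-01T12:30:00+00:00"  # sample value, not from this release
dt = dateutil_parser.isoparse(created_at)
created_millis = int(dt.timestamp() * 1000)
assert created_millis == 1714566600000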