acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/nifi.py

@@ -22,7 +22,9 @@ from requests_gssapi import HTTPSPNEGOAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import ContainerKey, gen_containers
 from datahub.ingestion.api.common import PipelineContext
@@ -33,9 +35,21 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import JobContainerSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
     BrowsePathsV2Class,
@@ -81,7 +95,7 @@ class ProcessGroupKey(ContainerKey):
     process_group_id: str
 
 
-class NifiSourceConfig(EnvConfigMixin):
+class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     site_url: str = Field(
         description="URL for Nifi, ending with /nifi/. e.g. https://mynifi.domain/nifi/"
     )
@@ -452,7 +466,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:
 
 
 @dataclass
-class NifiSourceReport(SourceReport):
+class NifiSourceReport(StaleEntityRemovalSourceReport):
     filtered: LossyList[str] = field(default_factory=LossyList)
 
     def report_dropped(self, ent_name: str) -> None:
@@ -464,13 +478,14 @@ class NifiSourceReport(SourceReport):
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
-class NifiSource(Source):
+class NifiSource(StatefulIngestionSourceBase):
     config: NifiSourceConfig
     report: NifiSourceReport
 
     def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None:
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config = config
+        self.ctx = ctx
         self.report = NifiSourceReport()
         self.session = requests.Session()
 
@@ -488,7 +503,7 @@ class NifiSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
-    def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None:  # noqa: C901
+    def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None:
        """
        Update self.nifi_flow with contents of the input process group `pg_flow_dto`
        """
@@ -894,7 +909,7 @@ class NifiSource(Source):
         if not delete_response.ok:
             logger.error("failed to delete provenance ", provenance_uri)
 
-    def construct_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
+    def construct_workunits(self) -> Iterable[MetadataWorkUnit]:
        rootpg = self.nifi_flow.root_process_group
        flow_name = rootpg.name  # self.config.site_name
        flow_urn = self.make_flow_urn()
@@ -1151,6 +1166,14 @@ class NifiSource(Source):
         token_response.raise_for_status()
         self.session.headers.update({"Authorization": "Bearer " + token_response.text})
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.authenticate()
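
The hunks above (and the analogous ones for the PowerBI Report Server and Redash sources later in this diff) apply one recurring pattern: the config gains StatefulIngestionConfigBase, the source switches to StatefulIngestionSourceBase, the report becomes a StaleEntityRemovalSourceReport, and stale-entity removal is registered as a workunit processor. A condensed sketch of that wiring, using a hypothetical ExampleSource rather than any source in this diff; the imports and calls mirror what the hunks show, nothing beyond that is documented here.

from typing import Iterable, List, Optional

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalHandler,
    StaleEntityRemovalSourceReport,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
    StatefulIngestionSourceBase,
)


class ExampleSourceConfig(StatefulIngestionConfigBase):
    # Source-specific options would go here; stateful_ingestion comes from the base class.
    pass


class ExampleSource(StatefulIngestionSourceBase):
    def __init__(self, config: ExampleSourceConfig, ctx: PipelineContext) -> None:
        super().__init__(config, ctx)  # note: (config, ctx), not just (ctx)
        self.config = config
        self.ctx = ctx
        self.report = StaleEntityRemovalSourceReport()

    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        # Registering the handler lets the framework soft-delete entities that were
        # emitted by a previous run but are missing from the current one.
        return [
            *super().get_workunit_processors(),
            StaleEntityRemovalHandler.create(
                self, self.config, self.ctx
            ).workunit_processor,
        ]

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        return []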
@@ -1211,11 +1234,14 @@ class NifiSource(Source):
         job_type: str,
         description: Optional[str],
         job_properties: Optional[Dict[str, str]] = None,
-        inlets: List[str] = [],
-        outlets: List[str] = [],
-        inputJobs: List[str] = [],
+        inlets: Optional[List[str]] = None,
+        outlets: Optional[List[str]] = None,
+        inputJobs: Optional[List[str]] = None,
         status: Optional[str] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        inlets = inlets or []
+        outlets = outlets or []
+        inputJobs = inputJobs or []
         logger.debug(f"Begining construction of job workunit for {job_urn}")
         if job_properties:
             job_properties = {k: v for k, v in job_properties.items() if v is not None}
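
This hunk, like several later ones (get_upstream_tables, token_values, itr_pages, _build_websocket_request_dict, _get_datasource_urns), replaces mutable default arguments with None plus an `or` fallback. A minimal standalone sketch of the pitfall being avoided; the function names are illustrative, not from the diff.

def append_bad(item, acc=[]):  # a single shared list is created once, at definition time
    acc.append(item)
    return acc


def append_good(item, acc=None):  # fresh list per call unless one is passed in
    acc = acc or []
    acc.append(item)
    return acc


print(append_bad("a"), append_bad("b"))    # ['a', 'b'] ['a', 'b'] -- state leaks across calls
print(append_good("a"), append_good("b"))  # ['a'] ['b']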
datahub/ingestion/source/openapi.py

@@ -270,7 +270,7 @@ class APISource(Source, ABC):
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
         return ApiWorkUnit(id=dataset_name, mce=mce)
 
-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
+    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
         config = self.config
 
         sw_dict = self.config.get_swagger()

datahub/ingestion/source/openapi_parser.py

@@ -12,7 +12,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaMetadata,
 )
-from datahub.metadata.schema_classes import SchemaFieldDataTypeClass, StringTypeClass
+from datahub.metadata.schema_classes import (
+    RecordTypeClass,
+    SchemaFieldDataTypeClass,
+    StringTypeClass,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -20,9 +24,12 @@ logger = logging.getLogger(__name__)
 def flatten(d: dict, prefix: str = "") -> Generator:
     for k, v in d.items():
         if isinstance(v, dict):
+            # First yield the parent field
+            yield f"{prefix}.{k}".strip(".")
+            # Then yield all nested fields
             yield from flatten(v, f"{prefix}.{k}")
         else:
-            yield f"{prefix}-{k}".strip(".")
+            yield f"{prefix}.{k}".strip(".")  # Use dot instead of hyphen
 
 
 def flatten2list(d: dict) -> list:
@@ -34,7 +41,7 @@ def flatten2list(d: dict) -> list:
      "anotherone": {"third_a": {"last": 3}}
     }
 
-    yeilds:
+    yields:
 
     ["first.second_a",
      "first.second_b",
@@ -43,7 +50,7 @@ def flatten2list(d: dict) -> list:
     ]
     """
     fl_l = list(flatten(d))
-    return [d[1:] if d[0] == "-" else d for d in fl_l]
+    return fl_l
 
 
 def request_call(
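
The net effect of the flatten/flatten2list changes above: nested keys are now joined with dots instead of hyphens, and each parent object is emitted as its own path before its children. A quick sketch of the new behavior, tracing the post-change logic shown in the hunks rather than running this package.

from typing import Generator


def flatten(d: dict, prefix: str = "") -> Generator:
    # mirrors the post-change logic from the diff above
    for k, v in d.items():
        if isinstance(v, dict):
            yield f"{prefix}.{k}".strip(".")        # parent field first
            yield from flatten(v, f"{prefix}.{k}")  # then nested fields
        else:
            yield f"{prefix}.{k}".strip(".")


example = {"first": {"second_a": 3, "second_b": 4}, "plain": 1}
print(list(flatten(example)))
# ['first', 'first.second_a', 'first.second_b', 'plain']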
@@ -111,7 +118,7 @@ def check_sw_version(sw_dict: dict) -> None:
         )
 
 
-def get_endpoints(sw_dict: dict) -> dict:  # noqa: C901
+def get_endpoints(sw_dict: dict) -> dict:
     """
     Get all the URLs, together with their description and the tags
     """
@@ -160,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     Try to determine if example data is defined for the endpoint, and return it
     """
     data = {}
-    if "content" in base_res.keys():
+    if "content" in base_res:
         res_cont = base_res["content"]
         if "application/json" in res_cont.keys():
             ex_field = None
@@ -181,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
             )
         elif "text/csv" in res_cont.keys():
             data = res_cont["text/csv"]["schema"]
-    elif "examples" in base_res.keys():
+    elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
 
     return data
@@ -322,6 +329,8 @@ def extract_fields(
             return ["contains_a_string"], {"contains_a_string": dict_data[0]}
         else:
             raise ValueError("unknown format")
+    elif not dict_data:  # Handle empty dict case
+        return [], {}
     if len(dict_data) > 1:
         # the elements are directly inside the dict
         return flatten2list(dict_data), dict_data
@@ -384,16 +393,39 @@ def set_metadata(
     dataset_name: str, fields: List, platform: str = "api"
 ) -> SchemaMetadata:
     canonical_schema: List[SchemaField] = []
-
-    for column in fields:
-        field = SchemaField(
-            fieldPath=column,
-            nativeDataType="str",
-            type=SchemaFieldDataTypeClass(type=StringTypeClass()),
-            description="",
-            recursive=False,
-        )
-        canonical_schema.append(field)
+    seen_paths = set()
+
+    # Process all flattened fields
+    for field_path in fields:
+        parts = field_path.split(".")
+
+        # Add struct/object fields for each ancestor path
+        current_path: List[str] = []
+        for part in parts[:-1]:
+            ancestor_path = ".".join(current_path + [part])
+            if ancestor_path not in seen_paths:
+                struct_field = SchemaField(
+                    fieldPath=ancestor_path,
+                    nativeDataType="object",  # OpenAPI term for struct/record
+                    type=SchemaFieldDataTypeClass(type=RecordTypeClass()),
+                    description="",
+                    recursive=False,
+                )
+                canonical_schema.append(struct_field)
+                seen_paths.add(ancestor_path)
+            current_path.append(part)
+
+        # Add the leaf field if not already seen
+        if field_path not in seen_paths:
+            leaf_field = SchemaField(
+                fieldPath=field_path,
+                nativeDataType="str",  # Keeping `str` for backwards compatability, ideally this is the correct type
+                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
+                description="",
+                recursive=False,
+            )
+            canonical_schema.append(leaf_field)
+            seen_paths.add(field_path)
 
     schema_metadata = SchemaMetadata(
         schemaName=dataset_name,
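
With the rewritten set_metadata above, every ancestor of a nested path becomes a RecordTypeClass ("object") field and the leaf stays a StringTypeClass ("str") field. A small sketch of which paths get which type, using the same traversal as the hunk; the input paths are made up for illustration.

# Illustration only: compute which paths become "object" (struct) fields and
# which stay "str" leaf fields, using the same traversal as the diff above.
fields = ["user.address.city", "user.name", "id"]

seen, rows = set(), []
for field_path in fields:
    parts = field_path.split(".")
    current = []
    for part in parts[:-1]:
        ancestor = ".".join(current + [part])
        if ancestor not in seen:
            rows.append((ancestor, "object"))  # RecordTypeClass in the real code
            seen.add(ancestor)
        current.append(part)
    if field_path not in seen:
        rows.append((field_path, "str"))  # StringTypeClass in the real code
        seen.add(field_path)

print(rows)
# [('user', 'object'), ('user.address', 'object'), ('user.address.city', 'str'),
#  ('user.name', 'str'), ('id', 'str')]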
datahub/ingestion/source/powerbi/m_query/parser.py

@@ -2,7 +2,7 @@ import functools
 import importlib.resources as pkg_resource
 import logging
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import lark
 from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
-    parameters: Dict[str, str] = {},
+    parameters: Optional[Dict[str, str]] = None,
 ) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
+    parameters = parameters or {}
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []

datahub/ingestion/source/powerbi/m_query/tree_function.py

@@ -70,13 +70,14 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
     return expression_tree
 
 
-def token_values(tree: Tree, parameters: Dict[str, str] = {}) -> List[str]:
+def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
     """
     :param tree: Tree to traverse
     :param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
                        using the values in 'parameters'.
     :return: List of leaf token data
     """
+    parameters = parameters or {}
     values: List[str] = []
 
     def internal(node: Union[Tree, Token]) -> None:

datahub/ingestion/source/powerbi/powerbi.py

@@ -890,9 +890,7 @@ class Mapper:
                     set(user_rights) & set(self.__config.ownership.owner_criteria)
                 )
                 > 0
-            ):
-                user_mcps.extend(self.to_datahub_user(user))
-            elif self.__config.ownership.owner_criteria is None:
+            ) or self.__config.ownership.owner_criteria is None:
                 user_mcps.extend(self.to_datahub_user(user))
             else:
                 continue

datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -380,8 +380,9 @@ class DataResolverBase(ABC):
     def itr_pages(
         self,
         endpoint: str,
-        parameter_override: Dict = {},
+        parameter_override: Optional[Dict] = None,
     ) -> Iterator[List[Dict]]:
+        parameter_override = parameter_override or {}
         params: dict = {
             "$skip": 0,
             "$top": self.TOP,

datahub/ingestion/source/powerbi_report_server/report_server.py

@@ -14,7 +14,9 @@ from requests_ntlm import HttpNtlmAuth
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.source_common import (
+    EnvConfigMixin,
+)
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -25,7 +27,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.powerbi_report_server.constants import (
     API_ENDPOINTS,
@@ -39,6 +41,14 @@ from datahub.ingestion.source.powerbi_report_server.report_server_domain import
     PowerBiReport,
     Report,
 )
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
@@ -58,7 +68,7 @@ from datahub.utilities.lossy_collections import LossyList
 LOGGER = logging.getLogger(__name__)
 
 
-class PowerBiReportServerAPIConfig(EnvConfigMixin):
+class PowerBiReportServerAPIConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     username: str = pydantic.Field(description="Windows account username")
     password: str = pydantic.Field(description="Windows account password")
     workstation_name: str = pydantic.Field(
@@ -186,7 +196,7 @@ class PowerBiReportServerAPI:
         }
 
         reports: List[Any] = []
-        for report_type in report_types_mapping.keys():
+        for report_type in report_types_mapping:
             report_get_endpoint: str = API_ENDPOINTS[report_type]
             # Replace place holders
             report_get_endpoint_http = report_get_endpoint.format(
@@ -475,7 +485,7 @@ class Mapper:
 
 
 @dataclass
-class PowerBiReportServerDashboardSourceReport(SourceReport):
+class PowerBiReportServerDashboardSourceReport(StaleEntityRemovalSourceReport):
     scanned_report: int = 0
     filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)
 
@@ -490,7 +500,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
-class PowerBiReportServerDashboardSource(Source):
+class PowerBiReportServerDashboardSource(StatefulIngestionSourceBase):
     """
     Use this plugin to connect to [PowerBI Report Server](https://powerbi.microsoft.com/en-us/report-server/).
     It extracts the following:
@@ -520,8 +530,9 @@ class PowerBiReportServerDashboardSource(Source):
     def __init__(
         self, config: PowerBiReportServerDashboardSourceConfig, ctx: PipelineContext
     ):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
+        self.ctx = ctx
         self.report = PowerBiReportServerDashboardSourceReport()
         self.auth = PowerBiReportServerAPI(self.source_config).get_auth_credentials
         self.powerbi_client = PowerBiReportServerAPI(self.source_config)
@@ -532,6 +543,14 @@ class PowerBiReportServerDashboardSource(Source):
         config = PowerBiReportServerDashboardSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method

datahub/ingestion/source/powerbi_report_server/report_server_domain.py

@@ -33,7 +33,7 @@ class CatalogItem(BaseModel):
     )
 
     @validator("display_name", always=True)
-    def validate_diplay_name(cls, value, values):  # noqa: N805
+    def validate_diplay_name(cls, value, values):
         if values["created_by"]:
             return values["created_by"].split("\\")[-1]
         return ""

datahub/ingestion/source/preset.py

@@ -16,10 +16,13 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
-    StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
 )
-from datahub.ingestion.source.superset import SupersetConfig, SupersetSource
+from datahub.ingestion.source.superset import (
+    SupersetConfig,
+    SupersetSource,
+    SupersetSourceReport,
+)
 from datahub.utilities import config_clean
 
 logger = logging.getLogger(__name__)
@@ -76,7 +79,7 @@ class PresetSource(SupersetSource):
     """
 
     config: PresetConfig
-    report: StaleEntityRemovalSourceReport
+    report: SupersetSourceReport
     platform = "preset"
 
     def __init__(self, ctx: PipelineContext, config: PresetConfig):
@@ -84,7 +87,7 @@ class PresetSource(SupersetSource):
 
         super().__init__(ctx, config)
         self.config = config
-        self.report = StaleEntityRemovalSourceReport()
+        self.report = SupersetSourceReport()
         self.platform = "preset"
 
     def login(self):

datahub/ingestion/source/pulsar.py

@@ -116,6 +116,7 @@ class PulsarSource(StatefulIngestionSourceBase):
     def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.platform: str = "pulsar"
+        self.ctx = ctx
         self.config: PulsarSourceConfig = config
         self.report: PulsarSourceReport = PulsarSourceReport()
 
@@ -229,8 +230,8 @@ class PulsarSource(StatefulIngestionSourceBase):
             self.report.report_warning("HTTPError", message)
         except requests.exceptions.RequestException as e:
             raise Exception(
-                f"An ambiguous exception occurred while handling the request: {e}"
-            )
+                "An ambiguous exception occurred while handling the request"
+            ) from e
 
     @classmethod
     def create(cls, config_dict, ctx):
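
The pulsar.py hunk above switches to exception chaining: instead of flattening the original error into the message, the RequestException is attached as the new exception's __cause__. A two-step sketch with generic exceptions, not the pulsar code.

def call():
    try:
        raise ConnectionError("socket closed")
    except ConnectionError as e:
        # "from e" keeps the original error as __cause__ and preserves its traceback
        raise RuntimeError("request handling failed") from e


try:
    call()
except RuntimeError as err:
    print(err, "| caused by:", repr(err.__cause__))
    # request handling failed | caused by: ConnectionError('socket closed')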
datahub/ingestion/source/qlik_sense/websocket_connection.py

@@ -17,8 +17,9 @@ class WebsocketConnection:
         self.handle = [-1]
 
     def _build_websocket_request_dict(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
+        params = params or {}
         return {
             "jsonrpc": "2.0",
             "id": self.request_id,
@@ -37,11 +38,12 @@ class WebsocketConnection:
         return {}
 
     def websocket_send_request(
-        self, method: str, params: Union[Dict, List] = {}
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
         """
         Method to send request to websocket
         """
+        params = params or {}
         self.request_id += 1
         request = self._build_websocket_request_dict(method, params)
         response = self._send_request(request=request)

datahub/ingestion/source/redash.py

@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
@@ -22,8 +22,20 @@ from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceCapability,
+    SourceReport,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -235,7 +247,9 @@ def get_full_qualified_name(platform: str, database_name: str, table_name: str)
     return f"{database_name}.{table_name}"
 
 
-class RedashConfig(ConfigModel):
+class RedashConfig(
+    StatefulIngestionConfigBase,
+):
     # See the Redash API for details
     # https://redash.io/help/user-guide/integrations-and-api/api
     connect_uri: str = Field(
@@ -277,7 +291,7 @@ class RedashConfig(ConfigModel):
 
 
 @dataclass
-class RedashSourceReport(SourceReport):
+class RedashSourceReport(StaleEntityRemovalSourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
     queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
@@ -305,7 +319,7 @@ class RedashSourceReport(SourceReport):
 @config_class(RedashConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
-class RedashSource(Source):
+class RedashSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
 
@@ -316,8 +330,9 @@ class RedashSource(Source):
     platform = "redash"
 
     def __init__(self, ctx: PipelineContext, config: RedashConfig):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.config: RedashConfig = config
+        self.ctx = ctx
         self.report: RedashSourceReport = RedashSourceReport()
 
         # Handle trailing slash removal
@@ -406,8 +421,9 @@ class RedashSource(Source):
         return database_name
 
     def _get_datasource_urns(
-        self, data_source: Dict, sql_query_data: Dict = {}
+        self, data_source: Dict, sql_query_data: Optional[Dict] = None
     ) -> Optional[List[str]]:
+        sql_query_data = sql_query_data or {}
         platform = self._get_platform_based_on_datasource(data_source)
         database_name = self._get_database_name_based_on_datasource(data_source)
         data_source_syntax = data_source.get("syntax")
@@ -724,6 +740,14 @@ class RedashSource(Source):
     def add_config_to_report(self) -> None:
         self.report.api_page_limit = self.config.api_page_limit
 
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.validate_connection()
         self.add_config_to_report()
datahub/ingestion/source/redshift/config.py

@@ -128,6 +128,10 @@ class RedshiftConfig(
         default=True,
         description="Whether lineage should be collected from copy commands",
     )
+    include_share_lineage: bool = Field(
+        default=True,
+        description="Whether lineage should be collected from datashares",
+    )
 
     include_usage_statistics: bool = Field(
         default=False,
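
The new RedshiftConfig.include_share_lineage flag defaults to True. A hypothetical recipe fragment (shown in Python dict form) indicating where the flag would be set if you wanted to turn datashare lineage off; only include_share_lineage comes from the diff above, the other values are placeholders.

# Hypothetical ingestion recipe fragment; values other than include_share_lineage
# are placeholders, not taken from this diff.
recipe = {
    "source": {
        "type": "redshift",
        "config": {
            "host_port": "my-cluster.example.us-east-1.redshift.amazonaws.com:5439",
            "database": "dev",
            "include_share_lineage": False,  # new in this release; defaults to True
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}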