acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py
@@ -9,7 +9,7 @@ import sys
 import threading
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
+from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 import click
 import humanfriendly
@@ -26,7 +26,7 @@ from datahub.ingestion.api.common import EndOfStream, PipelineContext, RecordEnv
 from datahub.ingestion.api.global_context import set_graph_context
 from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.api.sink import Sink, SinkReport, WriteCallback
+from datahub.ingestion.api.sink import Sink, SinkReport
 from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
@@ -35,15 +35,15 @@ from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
 from datahub.ingestion.run.pipeline_config import PipelineConfig, ReporterConfig
+from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, LoggingCallback
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
-from datahub.ingestion.sink.file import FileSink, FileSinkConfig
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.system_metadata_transformer import (
     SystemMetadataTransformer,
 )
 from datahub.ingestion.transformer.transform_registry import transform_registry
-from datahub.metadata.schema_classes import MetadataChangeProposalClass
+from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities._custom_package_loader import model_version_name
@@ -57,68 +57,6 @@ logger = logging.getLogger(__name__)
 _REPORT_PRINT_INTERVAL_SECONDS = 60
 
 
-class LoggingCallback(WriteCallback):
-    def __init__(self, name: str = "") -> None:
-        super().__init__()
-        self.name = name
-
-    def on_success(
-        self, record_envelope: RecordEnvelope, success_metadata: dict
-    ) -> None:
-        logger.debug(
-            f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
-        )
-
-    def on_failure(
-        self,
-        record_envelope: RecordEnvelope,
-        failure_exception: Exception,
-        failure_metadata: dict,
-    ) -> None:
-        logger.error(
-            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
-            extra={"failure_metadata": failure_metadata},
-            exc_info=failure_exception,
-        )
-
-
-class DeadLetterQueueCallback(WriteCallback):
-    def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
-        if not config:
-            config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
-        self.file_sink: FileSink = FileSink(ctx, config)
-        self.logging_callback = LoggingCallback(name="failure-queue")
-        logger.info(f"Failure logging enabled. Will log to {config.filename}.")
-
-    def on_success(
-        self, record_envelope: RecordEnvelope, success_metadata: dict
-    ) -> None:
-        pass
-
-    def on_failure(
-        self,
-        record_envelope: RecordEnvelope,
-        failure_exception: Exception,
-        failure_metadata: dict,
-    ) -> None:
-        if "workunit_id" in record_envelope.metadata:
-            if isinstance(record_envelope.record, MetadataChangeProposalClass):
-                mcp = cast(MetadataChangeProposalClass, record_envelope.record)
-                if mcp.systemMetadata:
-                    if not mcp.systemMetadata.properties:
-                        mcp.systemMetadata.properties = {}
-                    if "workunit_id" not in mcp.systemMetadata.properties:
-                        # update the workunit id
-                        mcp.systemMetadata.properties["workunit_id"] = (
-                            record_envelope.metadata["workunit_id"]
-                        )
-                record_envelope.record = mcp
-        self.file_sink.write_record_async(record_envelope, self.logging_callback)
-
-    def close(self) -> None:
-        self.file_sink.close()
-
-
 class PipelineInitError(Exception):
     pass
 
@@ -236,76 +174,99 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        self.graph = None
-        with _add_init_error_context("connect to DataHub"):
-            if self.config.datahub_api:
-                self.graph = DataHubGraph(self.config.datahub_api)
-                self.graph.test_connection()
-
-        with _add_init_error_context("set up framework context"):
-            self.ctx = PipelineContext(
-                run_id=self.config.run_id,
-                graph=self.graph,
-                pipeline_name=self.config.pipeline_name,
-                dry_run=dry_run,
-                preview_mode=preview_mode,
-                pipeline_config=self.config,
-            )
-
-        if self.config.sink is None:
-            logger.info(
-                "No sink configured, attempting to use the default datahub-rest sink."
-            )
-            with _add_init_error_context("configure the default rest sink"):
-                self.sink_type = "datahub-rest"
-                self.sink = _make_default_rest_sink(self.ctx)
-        else:
-            self.sink_type = self.config.sink.type
-            with _add_init_error_context(
-                f"find a registered sink for type {self.sink_type}"
-            ):
-                sink_class = sink_registry.get(self.sink_type)
-
-            with _add_init_error_context(f"configure the sink ({self.sink_type})"):
-                sink_config = self.config.sink.dict().get("config") or {}
-                self.sink = sink_class.create(sink_config, self.ctx)
-                logger.debug(f"Sink type {self.sink_type} ({sink_class}) configured")
-        logger.info(f"Sink configured successfully. {self.sink.configured()}")
-
-        if self.graph is None and isinstance(self.sink, DatahubRestSink):
-            with _add_init_error_context("setup default datahub client"):
-                self.graph = self.sink.emitter.to_graph()
-                self.graph.test_connection()
-                self.ctx.graph = self.graph
-        telemetry_instance.set_context(server=self.graph)
-
-        with set_graph_context(self.graph):
-            with _add_init_error_context("configure reporters"):
-                self._configure_reporting(report_to)
-
-            with _add_init_error_context(
-                f"find a registered source for type {self.source_type}"
-            ):
-                source_class = source_registry.get(self.source_type)
-
-            with _add_init_error_context(f"configure the source ({self.source_type})"):
-                self.source = source_class.create(
-                    self.config.source.dict().get("config", {}), self.ctx
-                )
-                logger.debug(
-                    f"Source type {self.source_type} ({source_class}) configured"
+        with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
+            self.graph: Optional[DataHubGraph] = None
+            with _add_init_error_context("connect to DataHub"):
+                if self.config.datahub_api:
+                    self.graph = exit_stack.enter_context(
+                        DataHubGraph(self.config.datahub_api)
+                    )
+                    self.graph.test_connection()
+
+            with _add_init_error_context("set up framework context"):
+                self.ctx = PipelineContext(
+                    run_id=self.config.run_id,
+                    graph=self.graph,
+                    pipeline_name=self.config.pipeline_name,
+                    dry_run=dry_run,
+                    preview_mode=preview_mode,
+                    pipeline_config=self.config,
                 )
-                logger.info("Source configured successfully.")
 
-        extractor_type = self.config.source.extractor
-        with _add_init_error_context(f"configure the extractor ({extractor_type})"):
-            extractor_class = extractor_registry.get(extractor_type)
-            self.extractor = extractor_class(
-                self.config.source.extractor_config, self.ctx
+            if self.config.sink is None:
+                logger.info(
+                    "No sink configured, attempting to use the default datahub-rest sink."
                 )
+                with _add_init_error_context("configure the default rest sink"):
+                    self.sink_type = "datahub-rest"
+                    self.sink = exit_stack.enter_context(
+                        _make_default_rest_sink(self.ctx)
+                    )
+            else:
+                self.sink_type = self.config.sink.type
+                with _add_init_error_context(
+                    f"find a registered sink for type {self.sink_type}"
+                ):
+                    sink_class = sink_registry.get(self.sink_type)
+
+                with _add_init_error_context(f"configure the sink ({self.sink_type})"):
+                    sink_config = self.config.sink.dict().get("config") or {}
+                    self.sink = exit_stack.enter_context(
+                        sink_class.create(sink_config, self.ctx)
+                    )
+                    logger.debug(
+                        f"Sink type {self.sink_type} ({sink_class}) configured"
+                    )
+            logger.info(f"Sink configured successfully. {self.sink.configured()}")
+
+            if self.graph is None and isinstance(self.sink, DatahubRestSink):
+                with _add_init_error_context("setup default datahub client"):
+                    self.graph = self.sink.emitter.to_graph()
+                    self.graph.test_connection()
+                    self.ctx.graph = self.graph
+            telemetry_instance.set_context(server=self.graph)
+
+            with set_graph_context(self.graph):
+                with _add_init_error_context("configure reporters"):
+                    self._configure_reporting(report_to)
 
-        with _add_init_error_context("configure transformers"):
-            self._configure_transforms()
+                with _add_init_error_context(
+                    f"find a registered source for type {self.source_type}"
+                ):
+                    source_class = source_registry.get(self.source_type)
+
+                with _add_init_error_context(
+                    f"configure the source ({self.source_type})"
+                ):
+                    self.source = inner_exit_stack.enter_context(
+                        source_class.create(
+                            self.config.source.dict().get("config", {}), self.ctx
+                        )
+                    )
+                    logger.debug(
+                        f"Source type {self.source_type} ({source_class}) configured"
+                    )
+                    logger.info("Source configured successfully.")
+
+                extractor_type = self.config.source.extractor
+                with _add_init_error_context(
+                    f"configure the extractor ({extractor_type})"
+                ):
+                    extractor_class = extractor_registry.get(extractor_type)
+                    self.extractor = inner_exit_stack.enter_context(
+                        extractor_class(self.config.source.extractor_config, self.ctx)
+                    )
+
+                with _add_init_error_context("configure transformers"):
+                    self._configure_transforms()
+
+            # If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
+            # We need to use an exit stack so that if we have an exception during initialization,
+            # things that were already initialized are still cleaned up.
+            # We need to separate the source/extractor from the rest because stateful
+            # ingestion requires the source to be closed before the state can be updated.
+            self.inner_exit_stack = inner_exit_stack.pop_all()
+            self.exit_stack = exit_stack.pop_all()
 
     @property
     def source_type(self) -> str:
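
The hunk above replaces the ad-hoc teardown in Pipeline.__init__ with two contextlib.ExitStack objects: everything entered so far is unwound automatically if a later init step raises, and pop_all() hands the surviving stacks to the instance so run() can close them later (the inner stack holds the source/extractor, which must close before commits). A minimal sketch of that pattern, with illustrative names that are not part of the DataHub API:

```python
import contextlib


class FakeResource:
    """Stand-in for a sink/source; only here to show the cleanup order."""

    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self) -> "FakeResource":
        return self

    def __exit__(self, *exc) -> None:
        print(f"closed {self.name}")


class Holder:
    def __init__(self) -> None:
        with contextlib.ExitStack() as stack:
            # If any enter_context() or later init step raises, the `with`
            # block unwinds and everything already entered is closed.
            self.sink = stack.enter_context(FakeResource("sink"))
            self.source = stack.enter_context(FakeResource("source"))
            # Init succeeded: transfer ownership out of the `with` block so
            # the resources stay open until the stack is explicitly closed.
            self.exit_stack = stack.pop_all()

    def run(self) -> None:
        with self.exit_stack:
            print("running with", self.sink.name, "and", self.source.name)
        # leaving the `with` closes source then sink (LIFO order)


Holder().run()
```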
@@ -439,18 +400,20 @@ class Pipeline:
             return True
         return False
 
-    def run(self) -> None:  # noqa: C901
-        with contextlib.ExitStack() as stack:
+    def run(self) -> None:
+        with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
 
-                stack.enter_context(
+                self.exit_stack.enter_context(
                     memray.Tracker(
                         f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin"
                     )
                 )
 
-            stack.enter_context(self.sink)
+            self.exit_stack.enter_context(
+                change_default_attribution(KnownAttribution.INGESTION)
+            )
 
             self.final_status = PipelineStatus.UNKNOWN
             self._notify_reporters_on_ingestion_start()
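
run() now enters change_default_attribution(KnownAttribution.INGESTION) from datahub.sdk._attribution through the preserved exit stack. A hedged sketch of the equivalent direct usage (the exact attribution semantics are not shown in this diff):

```python
from datahub.sdk._attribution import KnownAttribution, change_default_attribution

# Entering the context manager via exit_stack.enter_context(...), as run()
# does above, behaves the same as a plain `with` block:
with change_default_attribution(KnownAttribution.INGESTION):
    pass  # metadata emitted here is tagged with the INGESTION attribution (assumed semantics)
```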
@@ -459,8 +422,10 @@
                 callback = (
                     LoggingCallback()
                     if not self.config.failure_log.enabled
-                    else DeadLetterQueueCallback(
-                        self.ctx, self.config.failure_log.log_config
+                    else self.exit_stack.enter_context(
+                        DeadLetterQueueCallback(
+                            self.ctx, self.config.failure_log.log_config
+                        )
                     )
                 )
                 for wu in itertools.islice(
@@ -506,12 +471,11 @@
                             "Failed to process some records. Continuing.",
                             exc_info=e,
                         )
-                        # TODO: Transformer errors should cause the pipeline to fail.
+                        # TODO: Transformer errors should be reported more loudly / as part of the pipeline report.
 
                     if not self.dry_run:
                         self.sink.handle_work_unit_end(wu)
-                self.extractor.close()
-                self.source.close()
+
                 # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state
                 for record_envelope in self.transform(
                     [
@@ -527,6 +491,11 @@
                     # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
                     self.sink.write_record_async(record_envelope, callback)
 
+                # Stateful ingestion generates the updated state objects as part of the
+                # source's close method. Because of that, we need to close the source
+                # before we call process_commits.
+                self.inner_exit_stack.close()
+
                 self.process_commits()
                 self.final_status = PipelineStatus.COMPLETED
             except (SystemExit, KeyboardInterrupt) as e:
@@ -539,9 +508,6 @@
             finally:
                 clear_global_warnings()
 
-                if callback and hasattr(callback, "close"):
-                    callback.close()  # type: ignore
-
                 self._notify_reporters_on_ingestion_completion()
 
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
@@ -560,10 +526,8 @@
         Evaluates the commit_policy for each committable in the context and triggers the commit operation
         on the committable if its required commit policies are satisfied.
         """
-        has_errors: bool = (
-            True
-            if self.source.get_report().failures or self.sink.get_report().failures
-            else False
+        has_errors: bool = bool(
+            self.source.get_report().failures or self.sink.get_report().failures
         )
         has_warnings: bool = bool(
             self.source.get_report().warnings or self.sink.get_report().warnings

datahub/ingestion/run/sink_callback.py (new file)
@@ -0,0 +1,77 @@
+import logging
+import threading
+from typing import Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
+from datahub.ingestion.api.sink import WriteCallback
+from datahub.ingestion.sink.file import FileSink, FileSinkConfig
+from datahub.metadata.schema_classes import MetadataChangeProposalClass
+
+logger = logging.getLogger(__name__)
+
+
+class LoggingCallback(WriteCallback):
+    def __init__(self, name: str = "") -> None:
+        super().__init__()
+        self.name = name
+
+    def on_success(
+        self, record_envelope: RecordEnvelope, success_metadata: dict
+    ) -> None:
+        logger.debug(
+            f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
+        )
+
+    def on_failure(
+        self,
+        record_envelope: RecordEnvelope,
+        failure_exception: Exception,
+        failure_metadata: dict,
+    ) -> None:
+        logger.error(
+            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+            extra={"failure_metadata": failure_metadata},
+            exc_info=failure_exception,
+        )
+
+
+class DeadLetterQueueCallback(WriteCallback, Closeable):
+    def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
+        if not config:
+            config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
+        self.file_sink: FileSink = FileSink(ctx, config)
+        self.file_sink_lock = threading.Lock()
+        self.logging_callback = LoggingCallback(name="failure-queue")
+        logger.info(f"Failure logging enabled. Will log to {config.filename}.")
+
+    def on_success(
+        self, record_envelope: RecordEnvelope, success_metadata: dict
+    ) -> None:
+        pass
+
+    def on_failure(
+        self,
+        record_envelope: RecordEnvelope,
+        failure_exception: Exception,
+        failure_metadata: dict,
+    ) -> None:
+        if "workunit_id" in record_envelope.metadata and isinstance(
+            record_envelope.record, MetadataChangeProposalClass
+        ):
+            mcp: MetadataChangeProposalClass = record_envelope.record
+            if mcp.systemMetadata:
+                if not mcp.systemMetadata.properties:
+                    mcp.systemMetadata.properties = {}
+                if "workunit_id" not in mcp.systemMetadata.properties:
+                    # update the workunit id
+                    mcp.systemMetadata.properties["workunit_id"] = (
+                        record_envelope.metadata["workunit_id"]
+                    )
+            record_envelope.record = mcp
+        with self.file_sink_lock:
+            self.file_sink.write_record_async(record_envelope, self.logging_callback)
+
+    def close(self) -> None:
+        with self.file_sink_lock:
+            self.file_sink.close()
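
The callbacks formerly defined inline in pipeline.py now live in this module, and DeadLetterQueueCallback is a Closeable whose FileSink access is guarded by a lock. A hedged sketch of wiring it up by hand; the pipeline normally does this for you when failure_log.enabled is set, and the run_id and filename below are placeholders:

```python
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback
from datahub.ingestion.sink.file import FileSinkConfig

ctx = PipelineContext(run_id="demo-run")  # placeholder run id
dlq = DeadLetterQueueCallback(
    ctx, FileSinkConfig.parse_obj({"filename": "failed_events.json"})
)
try:
    pass  # pass `dlq` as the callback to sink.write_record_async(...) per record
finally:
    dlq.close()  # flushes and closes the underlying FileSink
```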

datahub/ingestion/sink/datahub_rest.py
@@ -20,7 +20,11 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.emitter.rest_emitter import (
     BATCH_INGEST_MAX_PAYLOAD_LENGTH,
+    DEFAULT_REST_SINK_ENDPOINT,
+    DEFAULT_REST_TRACE_MODE,
     DataHubRestEmitter,
+    RestSinkEndpoint,
+    RestTraceMode,
 )
 from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
 from datahub.ingestion.api.sink import (
@@ -66,6 +70,8 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
+    endpoint: RestSinkEndpoint = DEFAULT_REST_SINK_ENDPOINT
+    default_trace_mode: RestTraceMode = DEFAULT_REST_TRACE_MODE
 
     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -172,6 +178,8 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
             ca_certificate_path=config.ca_certificate_path,
             client_certificate_path=config.client_certificate_path,
             disable_ssl_verification=config.disable_ssl_verification,
+            openapi_ingestion=config.endpoint == RestSinkEndpoint.OPENAPI,
+            default_trace_mode=config.default_trace_mode == RestTraceMode.ENABLED,
         )
 
     @property
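
DatahubRestSinkConfig gains endpoint and default_trace_mode fields, which the sink translates into the emitter's openapi_ingestion and trace flags. A hedged sketch of constructing the config in Python; the field names come from the hunk above, while the server URL is a placeholder:

```python
from datahub.emitter.rest_emitter import RestSinkEndpoint, RestTraceMode
from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

config = DatahubRestSinkConfig(
    server="http://localhost:8080",            # placeholder GMS address
    endpoint=RestSinkEndpoint.OPENAPI,         # emit via the OpenAPI endpoint
    default_trace_mode=RestTraceMode.ENABLED,  # enable write tracing by default
)
```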

datahub/ingestion/source/abs/config.py
@@ -144,10 +144,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_not_empty(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_not_empty(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
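
The parameter annotation changes from str to Any because, with always=True, pydantic runs this validator even when platform was never supplied, so the incoming value can be None rather than a str. A minimal sketch of the v1-style validator pattern outside DataHub, with hypothetical field names:

```python
from typing import Any, Optional

import pydantic


class ExampleConfig(pydantic.BaseModel):
    path: str
    platform: Optional[str] = None

    @pydantic.validator("platform", always=True)
    def platform_not_empty(cls, platform: Any, values: dict) -> str:
        # always=True makes this run even when the field was omitted, so the
        # incoming value may be None rather than the str the annotation suggests.
        platform = platform or values.get("path", "").split("://")[0]
        if not platform:
            raise ValueError("platform must not be empty")
        return platform


print(ExampleConfig(path="s3://bucket/key").platform)  # -> "s3"
```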

datahub/ingestion/source/bigquery_v2/bigquery_audit.py
@@ -165,7 +165,7 @@ class BigQueryTableRef:
     @classmethod
     def from_spec_obj(cls, spec: dict) -> "BigQueryTableRef":
         for key in ["projectId", "datasetId", "tableId"]:
-            if key not in spec.keys():
+            if key not in spec:
                 raise ValueError(f"invalid BigQuery table reference dict: {spec}")
 
         return cls(

datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -1,8 +1,6 @@
-import json
 import logging
 import os
 import re
-import tempfile
 from datetime import timedelta
 from typing import Any, Dict, List, Optional, Union
 
@@ -17,10 +15,10 @@ from datahub.configuration.source_common import (
     PlatformInstanceConfigMixin,
 )
 from datahub.configuration.validate_field_removal import pydantic_removed_field
-from datahub.configuration.validate_multiline_string import pydantic_multiline_string
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -107,50 +105,8 @@ class BigQueryUsageConfig(BaseUsageConfig):
     )
 
 
-class BigQueryCredential(ConfigModel):
-    project_id: str = Field(description="Project id to set the credentials")
-    private_key_id: str = Field(description="Private key id")
-    private_key: str = Field(
-        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
-    )
-    client_email: str = Field(description="Client email")
-    client_id: str = Field(description="Client Id")
-    auth_uri: str = Field(
-        default="https://accounts.google.com/o/oauth2/auth",
-        description="Authentication uri",
-    )
-    token_uri: str = Field(
-        default="https://oauth2.googleapis.com/token", description="Token uri"
-    )
-    auth_provider_x509_cert_url: str = Field(
-        default="https://www.googleapis.com/oauth2/v1/certs",
-        description="Auth provider x509 certificate url",
-    )
-    type: str = Field(default="service_account", description="Authentication type")
-    client_x509_cert_url: Optional[str] = Field(
-        default=None,
-        description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
-    )
-
-    _fix_private_key_newlines = pydantic_multiline_string("private_key")
-
-    @root_validator(skip_on_failure=True)
-    def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        if values.get("client_x509_cert_url") is None:
-            values["client_x509_cert_url"] = (
-                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
-            )
-        return values
-
-    def create_credential_temp_file(self) -> str:
-        with tempfile.NamedTemporaryFile(delete=False) as fp:
-            cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": "))
-            fp.write(cred_json.encode())
-            return fp.name
-
-
 class BigQueryConnectionConfig(ConfigModel):
-    credential: Optional[BigQueryCredential] = Field(
+    credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
    )
 
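
The source-specific BigQueryCredential class is replaced by the shared GCPCredential from the new datahub/ingestion/source/common/gcp_credentials_config.py module. A hedged sketch of building the connection config; it assumes GCPCredential keeps the same service-account fields as the removed class, and all values below are placeholders:

```python
from datahub.ingestion.source.bigquery_v2.bigquery_config import (
    BigQueryConnectionConfig,
)

conn = BigQueryConnectionConfig.parse_obj(
    {
        "credential": {
            # Placeholder service-account values; field names assumed to match
            # the removed BigQueryCredential model.
            "project_id": "my-project",
            "private_key_id": "abc123",
            "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
            "client_email": "svc@my-project.iam.gserviceaccount.com",
            "client_id": "1234567890",
        }
    }
)
```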

datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -292,6 +292,11 @@ class BigQuerySchemaApi:
                     if hasattr(d, "_properties") and isinstance(d._properties, dict)
                     else None
                 ),
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                comment=self.bq_client.get_dataset(d.reference).description,
             )
             for d in datasets
         ]
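
The schema API now issues one get_dataset call per dataset just to read its description, which the TODOs above flag as a candidate for batching. A standalone illustration of that lookup using the google-cloud-bigquery client directly; "my-project" is a placeholder:

```python
from google.cloud import bigquery

client = bigquery.Client(project="my-project")
for item in client.list_datasets():
    dataset = client.get_dataset(item.reference)  # one API call per dataset
    print(dataset.dataset_id, dataset.description)
```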
@@ -339,7 +344,7 @@ class BigQuerySchemaApi:
         with_partitions: bool = False,
     ) -> Iterator[BigqueryTable]:
         with PerfTimer() as current_timer:
-            filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys())
+            filter_clause: str = ", ".join(f"'{table}'" for table in tables)
 
             if with_partitions:
                 query_template = BigqueryQuery.tables_for_dataset

datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -296,6 +296,7 @@ class BigQuerySchemaGenerator:
         self,
         dataset: str,
         project_id: str,
+        description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
     ) -> Iterable[MetadataWorkUnit]:
@@ -336,6 +337,7 @@ class BigQuerySchemaGenerator:
             domain_config=self.config.domain,
             schema_container_key=schema_container_key,
             database_container_key=database_container_key,
+            description=description,
             external_url=(
                 BQ_EXTERNAL_DATASET_URL_TEMPLATE.format(
                     project=project_id, dataset=dataset
@@ -471,14 +473,15 @@ class BigQuerySchemaGenerator:
 
         if self.config.include_schema_metadata:
             yield from self.gen_dataset_containers(
-                dataset_name,
-                project_id,
-                bigquery_dataset.labels,
-                (
+                dataset=dataset_name,
+                project_id=project_id,
+                tags=bigquery_dataset.labels,
+                extra_properties=(
                     {"location": bigquery_dataset.location}
                     if bigquery_dataset.location
                     else None
                 ),
+                description=bigquery_dataset.comment,
             )
 
         columns = None