acryl-datahub 1.0.0rc5__py3-none-any.whl → 1.0.0rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/METADATA +2449 -2449
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/RECORD +72 -71
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/cli/iceberg_cli.py +1 -1
- datahub/cli/ingest_cli.py +3 -1
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/configuration/kafka.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/graph/client.py +15 -6
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/run/pipeline.py +109 -143
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/csv_enricher.py +2 -2
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +3 -3
- datahub/ingestion/source/dremio/dremio_aspects.py +2 -1
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/identity/okta.py +1 -2
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/lookml_source.py +1 -1
- datahub/ingestion/source/metabase.py +54 -32
- datahub/ingestion/source/mlflow.py +30 -7
- datahub/ingestion/source/mode.py +8 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
- datahub/ingestion/source/nifi.py +29 -6
- datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/redash.py +29 -6
- datahub/ingestion/source/s3/config.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -6
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/oracle.py +34 -0
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/tableau/tableau_common.py +2 -1
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +2 -1
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +517 -410
- datahub/metadata/_urns/urn_defs.py +1670 -1670
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +17362 -17638
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +29 -12
- datahub/sdk/_entity.py +18 -1
- datahub/sdk/container.py +3 -1
- datahub/sdk/dataset.py +5 -3
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc5.dist-info → acryl_datahub-1.0.0rc7.dist-info}/top_level.txt +0 -0
|
@@ -93,7 +93,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
|
|
|
93
93
|
try:
|
|
94
94
|
return response["linkAssetVersion"]["urn"]
|
|
95
95
|
except KeyError:
|
|
96
|
-
raise ValueError(f"Unexpected response: {response}")
|
|
96
|
+
raise ValueError(f"Unexpected response: {response}") from None
|
|
97
97
|
|
|
98
98
|
def link_asset_to_versioned_asset(
|
|
99
99
|
self,
|
|
@@ -165,7 +165,7 @@ class EntityVersioningAPI(DataHubGraphProtocol):
|
|
|
165
165
|
try:
|
|
166
166
|
return response["unlinkAssetVersion"]["urn"]
|
|
167
167
|
except KeyError:
|
|
168
|
-
raise ValueError(f"Unexpected response: {response}")
|
|
168
|
+
raise ValueError(f"Unexpected response: {response}") from None
|
|
169
169
|
|
|
170
170
|
def unlink_latest_asset_from_version_set(
|
|
171
171
|
self, version_set_urn: str
|
|
@@ -198,4 +198,4 @@ class EntityVersioningAPI(DataHubGraphProtocol):
|
|
|
198
198
|
try:
|
|
199
199
|
return response["unlinkAssetVersion"]["urn"]
|
|
200
200
|
except KeyError:
|
|
201
|
-
raise ValueError(f"Unexpected response: {response}")
|
|
201
|
+
raise ValueError(f"Unexpected response: {response}") from None
|
|
@@ -9,7 +9,7 @@ import sys
|
|
|
9
9
|
import threading
|
|
10
10
|
import time
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
-
from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
|
|
12
|
+
from typing import Any, Dict, Iterable, Iterator, List, Optional
|
|
13
13
|
|
|
14
14
|
import click
|
|
15
15
|
import humanfriendly
|
|
@@ -26,7 +26,7 @@ from datahub.ingestion.api.common import EndOfStream, PipelineContext, RecordEnv
|
|
|
26
26
|
from datahub.ingestion.api.global_context import set_graph_context
|
|
27
27
|
from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
|
|
28
28
|
from datahub.ingestion.api.report import Report
|
|
29
|
-
from datahub.ingestion.api.sink import Sink, SinkReport, WriteCallback
|
|
29
|
+
from datahub.ingestion.api.sink import Sink, SinkReport
|
|
30
30
|
from datahub.ingestion.api.source import Extractor, Source
|
|
31
31
|
from datahub.ingestion.api.transform import Transformer
|
|
32
32
|
from datahub.ingestion.extractor.extractor_registry import extractor_registry
|
|
@@ -35,15 +35,15 @@ from datahub.ingestion.reporting.reporting_provider_registry import (
|
|
|
35
35
|
reporting_provider_registry,
|
|
36
36
|
)
|
|
37
37
|
from datahub.ingestion.run.pipeline_config import PipelineConfig, ReporterConfig
|
|
38
|
+
from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, LoggingCallback
|
|
38
39
|
from datahub.ingestion.sink.datahub_rest import DatahubRestSink
|
|
39
|
-
from datahub.ingestion.sink.file import FileSink, FileSinkConfig
|
|
40
40
|
from datahub.ingestion.sink.sink_registry import sink_registry
|
|
41
41
|
from datahub.ingestion.source.source_registry import source_registry
|
|
42
42
|
from datahub.ingestion.transformer.system_metadata_transformer import (
|
|
43
43
|
SystemMetadataTransformer,
|
|
44
44
|
)
|
|
45
45
|
from datahub.ingestion.transformer.transform_registry import transform_registry
|
|
46
|
-
from datahub.metadata.schema_classes import MetadataChangeProposalClass
|
|
46
|
+
from datahub.sdk._attribution import KnownAttribution, change_default_attribution
|
|
47
47
|
from datahub.telemetry import stats
|
|
48
48
|
from datahub.telemetry.telemetry import telemetry_instance
|
|
49
49
|
from datahub.utilities._custom_package_loader import model_version_name
|
|
@@ -57,68 +57,6 @@ logger = logging.getLogger(__name__)
|
|
|
57
57
|
_REPORT_PRINT_INTERVAL_SECONDS = 60
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
class LoggingCallback(WriteCallback):
|
|
61
|
-
def __init__(self, name: str = "") -> None:
|
|
62
|
-
super().__init__()
|
|
63
|
-
self.name = name
|
|
64
|
-
|
|
65
|
-
def on_success(
|
|
66
|
-
self, record_envelope: RecordEnvelope, success_metadata: dict
|
|
67
|
-
) -> None:
|
|
68
|
-
logger.debug(
|
|
69
|
-
f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
def on_failure(
|
|
73
|
-
self,
|
|
74
|
-
record_envelope: RecordEnvelope,
|
|
75
|
-
failure_exception: Exception,
|
|
76
|
-
failure_metadata: dict,
|
|
77
|
-
) -> None:
|
|
78
|
-
logger.error(
|
|
79
|
-
f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
|
|
80
|
-
extra={"failure_metadata": failure_metadata},
|
|
81
|
-
exc_info=failure_exception,
|
|
82
|
-
)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
class DeadLetterQueueCallback(WriteCallback):
|
|
86
|
-
def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
|
|
87
|
-
if not config:
|
|
88
|
-
config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
|
|
89
|
-
self.file_sink: FileSink = FileSink(ctx, config)
|
|
90
|
-
self.logging_callback = LoggingCallback(name="failure-queue")
|
|
91
|
-
logger.info(f"Failure logging enabled. Will log to {config.filename}.")
|
|
92
|
-
|
|
93
|
-
def on_success(
|
|
94
|
-
self, record_envelope: RecordEnvelope, success_metadata: dict
|
|
95
|
-
) -> None:
|
|
96
|
-
pass
|
|
97
|
-
|
|
98
|
-
def on_failure(
|
|
99
|
-
self,
|
|
100
|
-
record_envelope: RecordEnvelope,
|
|
101
|
-
failure_exception: Exception,
|
|
102
|
-
failure_metadata: dict,
|
|
103
|
-
) -> None:
|
|
104
|
-
if "workunit_id" in record_envelope.metadata:
|
|
105
|
-
if isinstance(record_envelope.record, MetadataChangeProposalClass):
|
|
106
|
-
mcp = cast(MetadataChangeProposalClass, record_envelope.record)
|
|
107
|
-
if mcp.systemMetadata:
|
|
108
|
-
if not mcp.systemMetadata.properties:
|
|
109
|
-
mcp.systemMetadata.properties = {}
|
|
110
|
-
if "workunit_id" not in mcp.systemMetadata.properties:
|
|
111
|
-
# update the workunit id
|
|
112
|
-
mcp.systemMetadata.properties["workunit_id"] = (
|
|
113
|
-
record_envelope.metadata["workunit_id"]
|
|
114
|
-
)
|
|
115
|
-
record_envelope.record = mcp
|
|
116
|
-
self.file_sink.write_record_async(record_envelope, self.logging_callback)
|
|
117
|
-
|
|
118
|
-
def close(self) -> None:
|
|
119
|
-
self.file_sink.close()
|
|
120
|
-
|
|
121
|
-
|
|
122
60
|
class PipelineInitError(Exception):
|
|
123
61
|
pass
|
|
124
62
|
|
|
@@ -236,76 +174,99 @@ class Pipeline:
|
|
|
236
174
|
self.last_time_printed = int(time.time())
|
|
237
175
|
self.cli_report = CliReport()
|
|
238
176
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
logger.info(
|
|
257
|
-
"No sink configured, attempting to use the default datahub-rest sink."
|
|
258
|
-
)
|
|
259
|
-
with _add_init_error_context("configure the default rest sink"):
|
|
260
|
-
self.sink_type = "datahub-rest"
|
|
261
|
-
self.sink = _make_default_rest_sink(self.ctx)
|
|
262
|
-
else:
|
|
263
|
-
self.sink_type = self.config.sink.type
|
|
264
|
-
with _add_init_error_context(
|
|
265
|
-
f"find a registered sink for type {self.sink_type}"
|
|
266
|
-
):
|
|
267
|
-
sink_class = sink_registry.get(self.sink_type)
|
|
268
|
-
|
|
269
|
-
with _add_init_error_context(f"configure the sink ({self.sink_type})"):
|
|
270
|
-
sink_config = self.config.sink.dict().get("config") or {}
|
|
271
|
-
self.sink = sink_class.create(sink_config, self.ctx)
|
|
272
|
-
logger.debug(f"Sink type {self.sink_type} ({sink_class}) configured")
|
|
273
|
-
logger.info(f"Sink configured successfully. {self.sink.configured()}")
|
|
274
|
-
|
|
275
|
-
if self.graph is None and isinstance(self.sink, DatahubRestSink):
|
|
276
|
-
with _add_init_error_context("setup default datahub client"):
|
|
277
|
-
self.graph = self.sink.emitter.to_graph()
|
|
278
|
-
self.graph.test_connection()
|
|
279
|
-
self.ctx.graph = self.graph
|
|
280
|
-
telemetry_instance.set_context(server=self.graph)
|
|
281
|
-
|
|
282
|
-
with set_graph_context(self.graph):
|
|
283
|
-
with _add_init_error_context("configure reporters"):
|
|
284
|
-
self._configure_reporting(report_to)
|
|
285
|
-
|
|
286
|
-
with _add_init_error_context(
|
|
287
|
-
f"find a registered source for type {self.source_type}"
|
|
288
|
-
):
|
|
289
|
-
source_class = source_registry.get(self.source_type)
|
|
290
|
-
|
|
291
|
-
with _add_init_error_context(f"configure the source ({self.source_type})"):
|
|
292
|
-
self.source = source_class.create(
|
|
293
|
-
self.config.source.dict().get("config", {}), self.ctx
|
|
294
|
-
)
|
|
295
|
-
logger.debug(
|
|
296
|
-
f"Source type {self.source_type} ({source_class}) configured"
|
|
177
|
+
with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
|
|
178
|
+
self.graph: Optional[DataHubGraph] = None
|
|
179
|
+
with _add_init_error_context("connect to DataHub"):
|
|
180
|
+
if self.config.datahub_api:
|
|
181
|
+
self.graph = exit_stack.enter_context(
|
|
182
|
+
DataHubGraph(self.config.datahub_api)
|
|
183
|
+
)
|
|
184
|
+
self.graph.test_connection()
|
|
185
|
+
|
|
186
|
+
with _add_init_error_context("set up framework context"):
|
|
187
|
+
self.ctx = PipelineContext(
|
|
188
|
+
run_id=self.config.run_id,
|
|
189
|
+
graph=self.graph,
|
|
190
|
+
pipeline_name=self.config.pipeline_name,
|
|
191
|
+
dry_run=dry_run,
|
|
192
|
+
preview_mode=preview_mode,
|
|
193
|
+
pipeline_config=self.config,
|
|
297
194
|
)
|
|
298
|
-
logger.info("Source configured successfully.")
|
|
299
195
|
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
self.extractor = extractor_class(
|
|
304
|
-
self.config.source.extractor_config, self.ctx
|
|
196
|
+
if self.config.sink is None:
|
|
197
|
+
logger.info(
|
|
198
|
+
"No sink configured, attempting to use the default datahub-rest sink."
|
|
305
199
|
)
|
|
200
|
+
with _add_init_error_context("configure the default rest sink"):
|
|
201
|
+
self.sink_type = "datahub-rest"
|
|
202
|
+
self.sink = exit_stack.enter_context(
|
|
203
|
+
_make_default_rest_sink(self.ctx)
|
|
204
|
+
)
|
|
205
|
+
else:
|
|
206
|
+
self.sink_type = self.config.sink.type
|
|
207
|
+
with _add_init_error_context(
|
|
208
|
+
f"find a registered sink for type {self.sink_type}"
|
|
209
|
+
):
|
|
210
|
+
sink_class = sink_registry.get(self.sink_type)
|
|
211
|
+
|
|
212
|
+
with _add_init_error_context(f"configure the sink ({self.sink_type})"):
|
|
213
|
+
sink_config = self.config.sink.dict().get("config") or {}
|
|
214
|
+
self.sink = exit_stack.enter_context(
|
|
215
|
+
sink_class.create(sink_config, self.ctx)
|
|
216
|
+
)
|
|
217
|
+
logger.debug(
|
|
218
|
+
f"Sink type {self.sink_type} ({sink_class}) configured"
|
|
219
|
+
)
|
|
220
|
+
logger.info(f"Sink configured successfully. {self.sink.configured()}")
|
|
221
|
+
|
|
222
|
+
if self.graph is None and isinstance(self.sink, DatahubRestSink):
|
|
223
|
+
with _add_init_error_context("setup default datahub client"):
|
|
224
|
+
self.graph = self.sink.emitter.to_graph()
|
|
225
|
+
self.graph.test_connection()
|
|
226
|
+
self.ctx.graph = self.graph
|
|
227
|
+
telemetry_instance.set_context(server=self.graph)
|
|
306
228
|
|
|
307
|
-
with
|
|
308
|
-
|
|
229
|
+
with set_graph_context(self.graph):
|
|
230
|
+
with _add_init_error_context("configure reporters"):
|
|
231
|
+
self._configure_reporting(report_to)
|
|
232
|
+
|
|
233
|
+
with _add_init_error_context(
|
|
234
|
+
f"find a registered source for type {self.source_type}"
|
|
235
|
+
):
|
|
236
|
+
source_class = source_registry.get(self.source_type)
|
|
237
|
+
|
|
238
|
+
with _add_init_error_context(
|
|
239
|
+
f"configure the source ({self.source_type})"
|
|
240
|
+
):
|
|
241
|
+
self.source = inner_exit_stack.enter_context(
|
|
242
|
+
source_class.create(
|
|
243
|
+
self.config.source.dict().get("config", {}), self.ctx
|
|
244
|
+
)
|
|
245
|
+
)
|
|
246
|
+
logger.debug(
|
|
247
|
+
f"Source type {self.source_type} ({source_class}) configured"
|
|
248
|
+
)
|
|
249
|
+
logger.info("Source configured successfully.")
|
|
250
|
+
|
|
251
|
+
extractor_type = self.config.source.extractor
|
|
252
|
+
with _add_init_error_context(
|
|
253
|
+
f"configure the extractor ({extractor_type})"
|
|
254
|
+
):
|
|
255
|
+
extractor_class = extractor_registry.get(extractor_type)
|
|
256
|
+
self.extractor = inner_exit_stack.enter_context(
|
|
257
|
+
extractor_class(self.config.source.extractor_config, self.ctx)
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
with _add_init_error_context("configure transformers"):
|
|
261
|
+
self._configure_transforms()
|
|
262
|
+
|
|
263
|
+
# If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
|
|
264
|
+
# We need to use an exit stack so that if we have an exception during initialization,
|
|
265
|
+
# things that were already initialized are still cleaned up.
|
|
266
|
+
# We need to separate the source/extractor from the rest because stateful
|
|
267
|
+
# ingestion requires the source to be closed before the state can be updated.
|
|
268
|
+
self.inner_exit_stack = inner_exit_stack.pop_all()
|
|
269
|
+
self.exit_stack = exit_stack.pop_all()
|
|
309
270
|
|
|
310
271
|
@property
|
|
311
272
|
def source_type(self) -> str:
|
|
@@ -440,17 +401,19 @@ class Pipeline:
|
|
|
440
401
|
return False
|
|
441
402
|
|
|
442
403
|
def run(self) -> None:
|
|
443
|
-
with
|
|
404
|
+
with self.exit_stack, self.inner_exit_stack:
|
|
444
405
|
if self.config.flags.generate_memory_profiles:
|
|
445
406
|
import memray
|
|
446
407
|
|
|
447
|
-
|
|
408
|
+
self.exit_stack.enter_context(
|
|
448
409
|
memray.Tracker(
|
|
449
410
|
f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin"
|
|
450
411
|
)
|
|
451
412
|
)
|
|
452
413
|
|
|
453
|
-
|
|
414
|
+
self.exit_stack.enter_context(
|
|
415
|
+
change_default_attribution(KnownAttribution.INGESTION)
|
|
416
|
+
)
|
|
454
417
|
|
|
455
418
|
self.final_status = PipelineStatus.UNKNOWN
|
|
456
419
|
self._notify_reporters_on_ingestion_start()
|
|
@@ -459,8 +422,10 @@ class Pipeline:
|
|
|
459
422
|
callback = (
|
|
460
423
|
LoggingCallback()
|
|
461
424
|
if not self.config.failure_log.enabled
|
|
462
|
-
else
|
|
463
|
-
|
|
425
|
+
else self.exit_stack.enter_context(
|
|
426
|
+
DeadLetterQueueCallback(
|
|
427
|
+
self.ctx, self.config.failure_log.log_config
|
|
428
|
+
)
|
|
464
429
|
)
|
|
465
430
|
)
|
|
466
431
|
for wu in itertools.islice(
|
|
@@ -506,12 +471,11 @@ class Pipeline:
|
|
|
506
471
|
"Failed to process some records. Continuing.",
|
|
507
472
|
exc_info=e,
|
|
508
473
|
)
|
|
509
|
-
# TODO: Transformer errors should
|
|
474
|
+
# TODO: Transformer errors should be reported more loudly / as part of the pipeline report.
|
|
510
475
|
|
|
511
476
|
if not self.dry_run:
|
|
512
477
|
self.sink.handle_work_unit_end(wu)
|
|
513
|
-
|
|
514
|
-
self.source.close()
|
|
478
|
+
|
|
515
479
|
# no more data is coming, we need to let the transformers produce any additional records if they are holding on to state
|
|
516
480
|
for record_envelope in self.transform(
|
|
517
481
|
[
|
|
@@ -527,6 +491,11 @@ class Pipeline:
|
|
|
527
491
|
# TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
|
|
528
492
|
self.sink.write_record_async(record_envelope, callback)
|
|
529
493
|
|
|
494
|
+
# Stateful ingestion generates the updated state objects as part of the
|
|
495
|
+
# source's close method. Because of that, we need to close the source
|
|
496
|
+
# before we call process_commits.
|
|
497
|
+
self.inner_exit_stack.close()
|
|
498
|
+
|
|
530
499
|
self.process_commits()
|
|
531
500
|
self.final_status = PipelineStatus.COMPLETED
|
|
532
501
|
except (SystemExit, KeyboardInterrupt) as e:
|
|
@@ -539,9 +508,6 @@ class Pipeline:
|
|
|
539
508
|
finally:
|
|
540
509
|
clear_global_warnings()
|
|
541
510
|
|
|
542
|
-
if callback and hasattr(callback, "close"):
|
|
543
|
-
callback.close() # type: ignore
|
|
544
|
-
|
|
545
511
|
self._notify_reporters_on_ingestion_completion()
|
|
546
512
|
|
|
547
513
|
def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import threading
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from datahub.ingestion.api.closeable import Closeable
|
|
6
|
+
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
|
|
7
|
+
from datahub.ingestion.api.sink import WriteCallback
|
|
8
|
+
from datahub.ingestion.sink.file import FileSink, FileSinkConfig
|
|
9
|
+
from datahub.metadata.schema_classes import MetadataChangeProposalClass
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LoggingCallback(WriteCallback):
|
|
15
|
+
def __init__(self, name: str = "") -> None:
|
|
16
|
+
super().__init__()
|
|
17
|
+
self.name = name
|
|
18
|
+
|
|
19
|
+
def on_success(
|
|
20
|
+
self, record_envelope: RecordEnvelope, success_metadata: dict
|
|
21
|
+
) -> None:
|
|
22
|
+
logger.debug(
|
|
23
|
+
f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
def on_failure(
|
|
27
|
+
self,
|
|
28
|
+
record_envelope: RecordEnvelope,
|
|
29
|
+
failure_exception: Exception,
|
|
30
|
+
failure_metadata: dict,
|
|
31
|
+
) -> None:
|
|
32
|
+
logger.error(
|
|
33
|
+
f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
|
|
34
|
+
extra={"failure_metadata": failure_metadata},
|
|
35
|
+
exc_info=failure_exception,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class DeadLetterQueueCallback(WriteCallback, Closeable):
|
|
40
|
+
def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
|
|
41
|
+
if not config:
|
|
42
|
+
config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
|
|
43
|
+
self.file_sink: FileSink = FileSink(ctx, config)
|
|
44
|
+
self.file_sink_lock = threading.Lock()
|
|
45
|
+
self.logging_callback = LoggingCallback(name="failure-queue")
|
|
46
|
+
logger.info(f"Failure logging enabled. Will log to {config.filename}.")
|
|
47
|
+
|
|
48
|
+
def on_success(
|
|
49
|
+
self, record_envelope: RecordEnvelope, success_metadata: dict
|
|
50
|
+
) -> None:
|
|
51
|
+
pass
|
|
52
|
+
|
|
53
|
+
def on_failure(
|
|
54
|
+
self,
|
|
55
|
+
record_envelope: RecordEnvelope,
|
|
56
|
+
failure_exception: Exception,
|
|
57
|
+
failure_metadata: dict,
|
|
58
|
+
) -> None:
|
|
59
|
+
if "workunit_id" in record_envelope.metadata and isinstance(
|
|
60
|
+
record_envelope.record, MetadataChangeProposalClass
|
|
61
|
+
):
|
|
62
|
+
mcp: MetadataChangeProposalClass = record_envelope.record
|
|
63
|
+
if mcp.systemMetadata:
|
|
64
|
+
if not mcp.systemMetadata.properties:
|
|
65
|
+
mcp.systemMetadata.properties = {}
|
|
66
|
+
if "workunit_id" not in mcp.systemMetadata.properties:
|
|
67
|
+
# update the workunit id
|
|
68
|
+
mcp.systemMetadata.properties["workunit_id"] = (
|
|
69
|
+
record_envelope.metadata["workunit_id"]
|
|
70
|
+
)
|
|
71
|
+
record_envelope.record = mcp
|
|
72
|
+
with self.file_sink_lock:
|
|
73
|
+
self.file_sink.write_record_async(record_envelope, self.logging_callback)
|
|
74
|
+
|
|
75
|
+
def close(self) -> None:
|
|
76
|
+
with self.file_sink_lock:
|
|
77
|
+
self.file_sink.close()
|