acryl-datahub 1.0.0rc4__py3-none-any.whl → 1.0.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (62)
  1. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/METADATA +2502 -2502
  2. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/RECORD +62 -59
  3. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/cli/ingest_cli.py +3 -1
  6. datahub/emitter/mcp_builder.py +4 -1
  7. datahub/ingestion/api/source_helpers.py +4 -0
  8. datahub/ingestion/run/pipeline.py +109 -143
  9. datahub/ingestion/run/sink_callback.py +77 -0
  10. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -0
  11. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  12. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  13. datahub/ingestion/source/cassandra/cassandra_api.py +11 -4
  14. datahub/ingestion/source/delta_lake/config.py +8 -1
  15. datahub/ingestion/source/delta_lake/report.py +4 -2
  16. datahub/ingestion/source/delta_lake/source.py +20 -5
  17. datahub/ingestion/source/elastic_search.py +26 -6
  18. datahub/ingestion/source/feast.py +27 -8
  19. datahub/ingestion/source/file.py +1 -1
  20. datahub/ingestion/source/identity/okta.py +1 -2
  21. datahub/ingestion/source/mlflow.py +30 -7
  22. datahub/ingestion/source/mode.py +7 -2
  23. datahub/ingestion/source/neo4j/neo4j_source.py +26 -6
  24. datahub/ingestion/source/nifi.py +29 -6
  25. datahub/ingestion/source/openapi_parser.py +46 -14
  26. datahub/ingestion/source/powerbi_report_server/report_server.py +25 -6
  27. datahub/ingestion/source/pulsar.py +1 -0
  28. datahub/ingestion/source/redash.py +29 -6
  29. datahub/ingestion/source/s3/config.py +3 -1
  30. datahub/ingestion/source/salesforce.py +28 -6
  31. datahub/ingestion/source/slack/slack.py +31 -10
  32. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  33. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  34. datahub/ingestion/source/sql/oracle.py +34 -0
  35. datahub/ingestion/source_config/pulsar.py +3 -1
  36. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  37. datahub/metadata/_schema_classes.py +534 -410
  38. datahub/metadata/_urns/urn_defs.py +1670 -1670
  39. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  40. datahub/metadata/schema.avsc +17379 -17637
  41. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  42. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  43. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  44. datahub/metadata/schemas/MetadataChangeEvent.avsc +13 -0
  45. datahub/metadata/schemas/__init__.py +3 -3
  46. datahub/sdk/__init__.py +29 -12
  47. datahub/sdk/_attribution.py +4 -0
  48. datahub/sdk/_entity.py +20 -1
  49. datahub/sdk/_shared.py +163 -13
  50. datahub/sdk/_utils.py +35 -0
  51. datahub/sdk/container.py +23 -5
  52. datahub/sdk/dataset.py +109 -17
  53. datahub/sdk/main_client.py +17 -0
  54. datahub/specific/dataset.py +3 -4
  55. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  56. datahub/sql_parsing/split_statements.py +20 -13
  57. datahub/utilities/file_backed_collections.py +3 -14
  58. datahub/utilities/sentinels.py +22 -0
  59. datahub/utilities/unified_diff.py +5 -1
  60. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/LICENSE +0 -0
  61. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/entry_points.txt +0 -0
  62. {acryl_datahub-1.0.0rc4.dist-info → acryl_datahub-1.0.0rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/run/pipeline.py

@@ -9,7 +9,7 @@ import sys
 import threading
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
+from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 import click
 import humanfriendly
@@ -26,7 +26,7 @@ from datahub.ingestion.api.common import EndOfStream, PipelineContext, RecordEnv
 from datahub.ingestion.api.global_context import set_graph_context
 from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
 from datahub.ingestion.api.report import Report
-from datahub.ingestion.api.sink import Sink, SinkReport, WriteCallback
+from datahub.ingestion.api.sink import Sink, SinkReport
 from datahub.ingestion.api.source import Extractor, Source
 from datahub.ingestion.api.transform import Transformer
 from datahub.ingestion.extractor.extractor_registry import extractor_registry
@@ -35,15 +35,15 @@ from datahub.ingestion.reporting.reporting_provider_registry import (
     reporting_provider_registry,
 )
 from datahub.ingestion.run.pipeline_config import PipelineConfig, ReporterConfig
+from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, LoggingCallback
 from datahub.ingestion.sink.datahub_rest import DatahubRestSink
-from datahub.ingestion.sink.file import FileSink, FileSinkConfig
 from datahub.ingestion.sink.sink_registry import sink_registry
 from datahub.ingestion.source.source_registry import source_registry
 from datahub.ingestion.transformer.system_metadata_transformer import (
     SystemMetadataTransformer,
 )
 from datahub.ingestion.transformer.transform_registry import transform_registry
-from datahub.metadata.schema_classes import MetadataChangeProposalClass
+from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
 from datahub.utilities._custom_package_loader import model_version_name
@@ -57,68 +57,6 @@ logger = logging.getLogger(__name__)
 _REPORT_PRINT_INTERVAL_SECONDS = 60
 
 
-class LoggingCallback(WriteCallback):
-    def __init__(self, name: str = "") -> None:
-        super().__init__()
-        self.name = name
-
-    def on_success(
-        self, record_envelope: RecordEnvelope, success_metadata: dict
-    ) -> None:
-        logger.debug(
-            f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
-        )
-
-    def on_failure(
-        self,
-        record_envelope: RecordEnvelope,
-        failure_exception: Exception,
-        failure_metadata: dict,
-    ) -> None:
-        logger.error(
-            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
-            extra={"failure_metadata": failure_metadata},
-            exc_info=failure_exception,
-        )
-
-
-class DeadLetterQueueCallback(WriteCallback):
-    def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
-        if not config:
-            config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
-        self.file_sink: FileSink = FileSink(ctx, config)
-        self.logging_callback = LoggingCallback(name="failure-queue")
-        logger.info(f"Failure logging enabled. Will log to {config.filename}.")
-
-    def on_success(
-        self, record_envelope: RecordEnvelope, success_metadata: dict
-    ) -> None:
-        pass
-
-    def on_failure(
-        self,
-        record_envelope: RecordEnvelope,
-        failure_exception: Exception,
-        failure_metadata: dict,
-    ) -> None:
-        if "workunit_id" in record_envelope.metadata:
-            if isinstance(record_envelope.record, MetadataChangeProposalClass):
-                mcp = cast(MetadataChangeProposalClass, record_envelope.record)
-                if mcp.systemMetadata:
-                    if not mcp.systemMetadata.properties:
-                        mcp.systemMetadata.properties = {}
-                    if "workunit_id" not in mcp.systemMetadata.properties:
-                        # update the workunit id
-                        mcp.systemMetadata.properties["workunit_id"] = (
-                            record_envelope.metadata["workunit_id"]
-                        )
-                record_envelope.record = mcp
-        self.file_sink.write_record_async(record_envelope, self.logging_callback)
-
-    def close(self) -> None:
-        self.file_sink.close()
-
-
 class PipelineInitError(Exception):
     pass
 
@@ -236,76 +174,99 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        self.graph = None
-        with _add_init_error_context("connect to DataHub"):
-            if self.config.datahub_api:
-                self.graph = DataHubGraph(self.config.datahub_api)
-                self.graph.test_connection()
-
-        with _add_init_error_context("set up framework context"):
-            self.ctx = PipelineContext(
-                run_id=self.config.run_id,
-                graph=self.graph,
-                pipeline_name=self.config.pipeline_name,
-                dry_run=dry_run,
-                preview_mode=preview_mode,
-                pipeline_config=self.config,
-            )
-
-        if self.config.sink is None:
-            logger.info(
-                "No sink configured, attempting to use the default datahub-rest sink."
-            )
-            with _add_init_error_context("configure the default rest sink"):
-                self.sink_type = "datahub-rest"
-                self.sink = _make_default_rest_sink(self.ctx)
-        else:
-            self.sink_type = self.config.sink.type
-            with _add_init_error_context(
-                f"find a registered sink for type {self.sink_type}"
-            ):
-                sink_class = sink_registry.get(self.sink_type)
-
-            with _add_init_error_context(f"configure the sink ({self.sink_type})"):
-                sink_config = self.config.sink.dict().get("config") or {}
-                self.sink = sink_class.create(sink_config, self.ctx)
-                logger.debug(f"Sink type {self.sink_type} ({sink_class}) configured")
-        logger.info(f"Sink configured successfully. {self.sink.configured()}")
-
-        if self.graph is None and isinstance(self.sink, DatahubRestSink):
-            with _add_init_error_context("setup default datahub client"):
-                self.graph = self.sink.emitter.to_graph()
-                self.graph.test_connection()
-                self.ctx.graph = self.graph
-        telemetry_instance.set_context(server=self.graph)
-
-        with set_graph_context(self.graph):
-            with _add_init_error_context("configure reporters"):
-                self._configure_reporting(report_to)
-
-            with _add_init_error_context(
-                f"find a registered source for type {self.source_type}"
-            ):
-                source_class = source_registry.get(self.source_type)
-
-            with _add_init_error_context(f"configure the source ({self.source_type})"):
-                self.source = source_class.create(
-                    self.config.source.dict().get("config", {}), self.ctx
-                )
-                logger.debug(
-                    f"Source type {self.source_type} ({source_class}) configured"
+        with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
+            self.graph: Optional[DataHubGraph] = None
+            with _add_init_error_context("connect to DataHub"):
+                if self.config.datahub_api:
+                    self.graph = exit_stack.enter_context(
+                        DataHubGraph(self.config.datahub_api)
+                    )
+                    self.graph.test_connection()
+
+            with _add_init_error_context("set up framework context"):
+                self.ctx = PipelineContext(
+                    run_id=self.config.run_id,
+                    graph=self.graph,
+                    pipeline_name=self.config.pipeline_name,
+                    dry_run=dry_run,
+                    preview_mode=preview_mode,
+                    pipeline_config=self.config,
                 )
-            logger.info("Source configured successfully.")
 
-            extractor_type = self.config.source.extractor
-            with _add_init_error_context(f"configure the extractor ({extractor_type})"):
-                extractor_class = extractor_registry.get(extractor_type)
-                self.extractor = extractor_class(
-                    self.config.source.extractor_config, self.ctx
+            if self.config.sink is None:
+                logger.info(
+                    "No sink configured, attempting to use the default datahub-rest sink."
                 )
+                with _add_init_error_context("configure the default rest sink"):
+                    self.sink_type = "datahub-rest"
+                    self.sink = exit_stack.enter_context(
+                        _make_default_rest_sink(self.ctx)
+                    )
+            else:
+                self.sink_type = self.config.sink.type
+                with _add_init_error_context(
+                    f"find a registered sink for type {self.sink_type}"
+                ):
+                    sink_class = sink_registry.get(self.sink_type)
+
+                with _add_init_error_context(f"configure the sink ({self.sink_type})"):
+                    sink_config = self.config.sink.dict().get("config") or {}
+                    self.sink = exit_stack.enter_context(
+                        sink_class.create(sink_config, self.ctx)
+                    )
+                    logger.debug(
+                        f"Sink type {self.sink_type} ({sink_class}) configured"
+                    )
+            logger.info(f"Sink configured successfully. {self.sink.configured()}")
+
+            if self.graph is None and isinstance(self.sink, DatahubRestSink):
+                with _add_init_error_context("setup default datahub client"):
+                    self.graph = self.sink.emitter.to_graph()
+                    self.graph.test_connection()
+                    self.ctx.graph = self.graph
+            telemetry_instance.set_context(server=self.graph)
 
-            with _add_init_error_context("configure transformers"):
-                self._configure_transforms()
+            with set_graph_context(self.graph):
+                with _add_init_error_context("configure reporters"):
+                    self._configure_reporting(report_to)
+
+                with _add_init_error_context(
+                    f"find a registered source for type {self.source_type}"
+                ):
+                    source_class = source_registry.get(self.source_type)
+
+                with _add_init_error_context(
+                    f"configure the source ({self.source_type})"
+                ):
+                    self.source = inner_exit_stack.enter_context(
+                        source_class.create(
+                            self.config.source.dict().get("config", {}), self.ctx
+                        )
+                    )
+                    logger.debug(
+                        f"Source type {self.source_type} ({source_class}) configured"
+                    )
+                logger.info("Source configured successfully.")
+
+                extractor_type = self.config.source.extractor
+                with _add_init_error_context(
+                    f"configure the extractor ({extractor_type})"
+                ):
+                    extractor_class = extractor_registry.get(extractor_type)
+                    self.extractor = inner_exit_stack.enter_context(
+                        extractor_class(self.config.source.extractor_config, self.ctx)
+                    )
+
+                with _add_init_error_context("configure transformers"):
+                    self._configure_transforms()
+
+            # If all of the initialization succeeds, we can preserve the exit stack until the pipeline run.
+            # We need to use an exit stack so that if we have an exception during initialization,
+            # things that were already initialized are still cleaned up.
+            # We need to separate the source/extractor from the rest because stateful
+            # ingestion requires the source to be closed before the state can be updated.
+            self.inner_exit_stack = inner_exit_stack.pop_all()
+            self.exit_stack = exit_stack.pop_all()
 
     @property
     def source_type(self) -> str:
@@ -440,17 +401,19 @@ class Pipeline:
         return False
 
     def run(self) -> None:
-        with contextlib.ExitStack() as stack:
+        with self.exit_stack, self.inner_exit_stack:
            if self.config.flags.generate_memory_profiles:
                import memray
 
-                stack.enter_context(
+                self.exit_stack.enter_context(
                    memray.Tracker(
                        f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin"
                    )
                )
 
-            stack.enter_context(self.sink)
+            self.exit_stack.enter_context(
+                change_default_attribution(KnownAttribution.INGESTION)
+            )
 
            self.final_status = PipelineStatus.UNKNOWN
            self._notify_reporters_on_ingestion_start()
@@ -459,8 +422,10 @@
             callback = (
                 LoggingCallback()
                 if not self.config.failure_log.enabled
-                else DeadLetterQueueCallback(
-                    self.ctx, self.config.failure_log.log_config
+                else self.exit_stack.enter_context(
+                    DeadLetterQueueCallback(
+                        self.ctx, self.config.failure_log.log_config
+                    )
                 )
             )
             for wu in itertools.islice(
@@ -506,12 +471,11 @@
                         "Failed to process some records. Continuing.",
                         exc_info=e,
                     )
-                    # TODO: Transformer errors should cause the pipeline to fail.
+                    # TODO: Transformer errors should be reported more loudly / as part of the pipeline report.
 
                 if not self.dry_run:
                     self.sink.handle_work_unit_end(wu)
-            self.extractor.close()
-            self.source.close()
+
             # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state
             for record_envelope in self.transform(
                 [
@@ -527,6 +491,11 @@
                     # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
                     self.sink.write_record_async(record_envelope, callback)
 
+            # Stateful ingestion generates the updated state objects as part of the
+            # source's close method. Because of that, we need to close the source
+            # before we call process_commits.
+            self.inner_exit_stack.close()
+
             self.process_commits()
             self.final_status = PipelineStatus.COMPLETED
         except (SystemExit, KeyboardInterrupt) as e:
@@ -539,9 +508,6 @@
         finally:
             clear_global_warnings()
 
-            if callback and hasattr(callback, "close"):
-                callback.close()  # type: ignore
-
             self._notify_reporters_on_ingestion_completion()
 
     def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
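
The pipeline refactor above replaces manual close() calls with two contextlib.ExitStack instances: every resource is registered on a stack during __init__, and pop_all() detaches the stack so cleanup is deferred to run() while still unwinding correctly if initialization fails partway through. A minimal, self-contained sketch of that pattern; Resource and MiniPipeline are illustrative stand-ins, not DataHub classes:

import contextlib


class Resource:
    # Stand-in for a sink/source/extractor; prints so the cleanup order is visible.
    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self) -> "Resource":
        print(f"open {self.name}")
        return self

    def __exit__(self, *exc) -> None:
        print(f"close {self.name}")


class MiniPipeline:
    def __init__(self) -> None:
        with contextlib.ExitStack() as stack:
            # If anything below raises, the with-block unwinds and closes
            # whatever was already opened.
            self.sink = stack.enter_context(Resource("sink"))
            self.source = stack.enter_context(Resource("source"))
            # Initialization succeeded: detach the callbacks so cleanup is
            # deferred to run() instead of happening when __init__ returns.
            self.exit_stack = stack.pop_all()

    def run(self) -> None:
        with self.exit_stack:
            pass  # do work; source and sink are closed (LIFO) when this block exits


MiniPipeline().run()

The real Pipeline keeps a second, inner stack for the source and extractor so they can be closed earlier than the sink, before process_commits(), as the comments in the diff explain.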
datahub/ingestion/run/sink_callback.py

@@ -0,0 +1,77 @@
+import logging
+import threading
+from typing import Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
+from datahub.ingestion.api.sink import WriteCallback
+from datahub.ingestion.sink.file import FileSink, FileSinkConfig
+from datahub.metadata.schema_classes import MetadataChangeProposalClass
+
+logger = logging.getLogger(__name__)
+
+
+class LoggingCallback(WriteCallback):
+    def __init__(self, name: str = "") -> None:
+        super().__init__()
+        self.name = name
+
+    def on_success(
+        self, record_envelope: RecordEnvelope, success_metadata: dict
+    ) -> None:
+        logger.debug(
+            f"{self.name} sink wrote workunit {record_envelope.metadata['workunit_id']}"
+        )
+
+    def on_failure(
+        self,
+        record_envelope: RecordEnvelope,
+        failure_exception: Exception,
+        failure_metadata: dict,
+    ) -> None:
+        logger.error(
+            f"{self.name} failed to write record with workunit {record_envelope.metadata['workunit_id']}",
+            extra={"failure_metadata": failure_metadata},
+            exc_info=failure_exception,
+        )
+
+
+class DeadLetterQueueCallback(WriteCallback, Closeable):
+    def __init__(self, ctx: PipelineContext, config: Optional[FileSinkConfig]) -> None:
+        if not config:
+            config = FileSinkConfig.parse_obj({"filename": "failed_events.json"})
+        self.file_sink: FileSink = FileSink(ctx, config)
+        self.file_sink_lock = threading.Lock()
+        self.logging_callback = LoggingCallback(name="failure-queue")
+        logger.info(f"Failure logging enabled. Will log to {config.filename}.")
+
+    def on_success(
+        self, record_envelope: RecordEnvelope, success_metadata: dict
+    ) -> None:
+        pass
+
+    def on_failure(
+        self,
+        record_envelope: RecordEnvelope,
+        failure_exception: Exception,
+        failure_metadata: dict,
+    ) -> None:
+        if "workunit_id" in record_envelope.metadata and isinstance(
+            record_envelope.record, MetadataChangeProposalClass
+        ):
+            mcp: MetadataChangeProposalClass = record_envelope.record
+            if mcp.systemMetadata:
+                if not mcp.systemMetadata.properties:
+                    mcp.systemMetadata.properties = {}
+                if "workunit_id" not in mcp.systemMetadata.properties:
+                    # update the workunit id
+                    mcp.systemMetadata.properties["workunit_id"] = (
+                        record_envelope.metadata["workunit_id"]
+                    )
+            record_envelope.record = mcp
+        with self.file_sink_lock:
+            self.file_sink.write_record_async(record_envelope, self.logging_callback)
+
+    def close(self) -> None:
+        with self.file_sink_lock:
+            self.file_sink.close()
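
For orientation, this is roughly how Pipeline.run() wires these callbacks in the hunks above. The snippet below is a hedged sketch of the same wiring, not a runnable script: ctx, sink, record_envelope, and failure_log_enabled are assumed to come from an already-configured ingestion pipeline.

from datahub.ingestion.run.sink_callback import DeadLetterQueueCallback, LoggingCallback

callback = (
    LoggingCallback()
    if not failure_log_enabled
    # Passing config=None makes the dead-letter queue default to failed_events.json.
    else DeadLetterQueueCallback(ctx, None)
)

# Every record handed to the sink carries this callback; failed records are
# appended to the dead-letter file with the originating workunit_id stamped
# into systemMetadata.properties.
sink.write_record_async(record_envelope, callback)

# DeadLetterQueueCallback owns a FileSink, so it must be closed when ingestion
# finishes; Pipeline.run() handles this by registering it on its exit stack.
if isinstance(callback, DeadLetterQueueCallback):
    callback.close()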
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -292,6 +292,11 @@ class BigQuerySchemaApi:
                     if hasattr(d, "_properties") and isinstance(d._properties, dict)
                     else None
                 ),
+                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+                # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                comment=self.bq_client.get_dataset(d.reference).description,
             )
             for d in datasets
         ]
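
The new comment= field costs one extra BigQuery API round trip per dataset, which is what the TODO about batching refers to: the description is only populated on the full Dataset object, not on the items returned by the list call. A standalone sketch of the underlying google-cloud-bigquery calls, with a placeholder project id:

from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project id

# One list call, then one get_dataset() call per dataset to read the description.
for item in client.list_datasets("my-project"):
    dataset = client.get_dataset(item.reference)
    print(item.dataset_id, dataset.description)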
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -296,6 +296,7 @@ class BigQuerySchemaGenerator:
         self,
         dataset: str,
         project_id: str,
+        description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
     ) -> Iterable[MetadataWorkUnit]:
@@ -336,6 +337,7 @@
             domain_config=self.config.domain,
             schema_container_key=schema_container_key,
             database_container_key=database_container_key,
+            description=description,
             external_url=(
                 BQ_EXTERNAL_DATASET_URL_TEMPLATE.format(
                     project=project_id, dataset=dataset
@@ -471,14 +473,15 @@
 
         if self.config.include_schema_metadata:
             yield from self.gen_dataset_containers(
-                dataset_name,
-                project_id,
-                bigquery_dataset.labels,
-                (
+                dataset=dataset_name,
+                project_id=project_id,
+                tags=bigquery_dataset.labels,
+                extra_properties=(
                     {"location": bigquery_dataset.location}
                     if bigquery_dataset.location
                     else None
                 ),
+                description=bigquery_dataset.comment,
             )
 
         columns = None