acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (107)
  1. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
  2. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
  3. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
  59. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  62. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  63. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  64. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  65. datahub/ingestion/source/sql/mssql/source.py +8 -4
  66. datahub/ingestion/source/sql/oracle.py +51 -4
  67. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  68. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  69. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  70. datahub/ingestion/source/superset.py +291 -35
  71. datahub/ingestion/source/usage/usage_common.py +0 -65
  72. datahub/ingestion/source/vertexai/__init__.py +0 -0
  73. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  74. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  75. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  76. datahub/metadata/_schema_classes.py +472 -1
  77. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  80. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  81. datahub/metadata/schema.avsc +313 -2
  82. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  83. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  84. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  85. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  86. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  87. datahub/metadata/schemas/Deprecation.avsc +2 -0
  88. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  89. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  90. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  91. datahub/metadata/schemas/Siblings.avsc +2 -0
  92. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  93. datahub/sdk/__init__.py +1 -0
  94. datahub/sdk/dataset.py +122 -0
  95. datahub/sdk/entity.py +99 -3
  96. datahub/sdk/entity_client.py +27 -3
  97. datahub/sdk/main_client.py +24 -1
  98. datahub/sdk/search_client.py +81 -8
  99. datahub/sdk/search_filters.py +94 -37
  100. datahub/sql_parsing/split_statements.py +17 -3
  101. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  102. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  103. datahub/testing/mcp_diff.py +1 -18
  104. datahub/utilities/threaded_iterator_executor.py +16 -3
  105. datahub/ingestion/source/vertexai.py +0 -697
  106. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  107. {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
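
Of the 107 files, only the diffs for the Apache Iceberg source and its profiler (items 39 and 40 above) are reproduced below. The common thread in both is a move away from emitting one MetadataChangeEvent snapshot per table and toward yielding individual aspects, each wrapped in a MetadataChangeProposalWrapper work unit. A minimal sketch of that emission pattern follows, using helpers that appear in the diff; the table name and subtype literal are made-up examples, not values from the package.

# Minimal sketch of the aspect-per-work-unit pattern used by the new Iceberg source.
# The URN inputs below are hypothetical; only the helper functions are real.
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass, SubTypesClass

dataset_urn = make_dataset_urn_with_platform_instance(
    platform="iceberg",
    name="demo_namespace.demo_table",  # hypothetical table
    platform_instance=None,
    env="PROD",
)

# Each aspect becomes its own work unit instead of one aspect list inside a DatasetSnapshot MCE.
for aspect in [StatusClass(removed=False), SubTypesClass(typeNames=["Table"])]:
    workunit = MetadataChangeProposalWrapper(
        entityUrn=dataset_urn, aspect=aspect
    ).as_workunit()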
datahub/ingestion/source/iceberg/iceberg.py

@@ -38,6 +38,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +46,7 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +59,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,21 +74,22 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -121,9 +128,10 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
 
@@ -140,13 +148,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -155,6 +162,14 @@
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -164,29 +179,27 @@
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.report_warning(
-                    "no-such-namespace",
-                    f"Couldn't list tables for namespace {namespace} due to NoSuchNamespaceError exception",
-                )
-                LOGGER.warning(
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "listing-tables-exception",
-                    f"Couldn't list tables for namespace {namespace} due to {e}",
-                )
-                LOGGER.exception(
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -202,56 +215,66 @@
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-                yield from self._create_iceberg_workunit(dataset_name, table)
-            except NoSuchPropertyException as e:
-                self.report.report_warning(
-                    "table-property-missing",
-                    f"Failed to create workunit for {dataset_name}. {e}",
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-                LOGGER.warning(
-                    f"NoSuchPropertyException while processing table {dataset_path}, skipping it.",
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.report_warning(
-                    "not-an-iceberg-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.report_warning(
-                    "no-such-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.report_warning(
-                    "file-not-found",
-                    f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ServerError as e:
-                self.report.report_warning(
-                    "iceberg-rest-server-error",
-                    f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
-                )
-                LOGGER.warning(
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -263,106 +286,96 @@
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "general",
-                    f"Failed to create workunit for dataset {dataset_path}: {e}",
-                )
-                LOGGER.exception(
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
             catalog = self.config.get_catalog()
         except Exception as e:
-            self.report.report_failure("get-catalog", f"Failed to get catalog: {e}")
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            namespace_ids = self._get_namespaces(catalog)
+            namespaces: List[Tuple[Identifier, str]] = []
+            for namespace in namespace_ids:
+                namespace_repr = ".".join(namespace)
+                LOGGER.debug(f"Processing namespace {namespace_repr}")
+                namespace_urn = make_container_urn(
+                    NamespaceKey(
+                        namespace=namespace_repr,
+                        platform=self.platform,
+                        instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                )
+                namespaces.append((namespace, namespace_urn))
+                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+            LOGGER.debug("Namespaces ingestion completed")
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[(dataset_path,) for dataset_path in self._get_datasets(catalog)],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    catalog, namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
             yield wu
 
-    def _create_iceberg_workunit(
-        self, dataset_name: str, table: Table
-    ) -> Iterable[MetadataWorkUnit]:
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            dataset_snapshot = DatasetSnapshot(
-                urn=dataset_urn,
-                aspects=[Status(removed=False)],
-            )
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-            # Dataset properties aspect.
-            additional_properties = {}
-            custom_properties = table.metadata.properties.copy()
-            custom_properties["location"] = table.metadata.location
-            custom_properties["format-version"] = str(table.metadata.format_version)
-            custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-            if table.current_snapshot():
-                custom_properties["snapshot-id"] = str(
-                    table.current_snapshot().snapshot_id
-                )
-                custom_properties["manifest-list"] = (
-                    table.current_snapshot().manifest_list
-                )
-                additional_properties["lastModified"] = TimeStampClass(
-                    int(table.current_snapshot().timestamp_ms)
-                )
-            if "created-at" in custom_properties:
-                try:
-                    dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                    additional_properties["created"] = TimeStampClass(
-                        int(dt.timestamp() * 1000)
-                    )
-                except Exception as ex:
-                    LOGGER.warning(
-                        f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                    )
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
-            dataset_properties = DatasetPropertiesClass(
-                name=table.name()[-1],
-                description=table.metadata.properties.get("comment", None),
-                customProperties=custom_properties,
-                lastModified=additional_properties.get("lastModified"),
-                created=additional_properties.get("created"),
-                qualifiedName=dataset_name,
-            )
-            dataset_snapshot.aspects.append(dataset_properties)
-            # Dataset ownership aspect.
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-                dataset_snapshot.aspects.append(dataset_ownership)
+                yield dataset_ownership
 
-            schema_metadata = self._create_schema_metadata(dataset_name, table)
-            dataset_snapshot.aspects.append(schema_metadata)
+            yield self._create_schema_metadata(dataset_name, table)
+            yield self._get_dataplatform_instance_aspect()
+            yield ContainerClass(container=str(namespace_urn))
 
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
             )
-            yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-        dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-        if dpi_aspect:
-            yield dpi_aspect
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -401,12 +414,48 @@
                 ]
             )
         except Exception as e:
-            self.report.report_warning(
-                "extract-partition",
-                f"Failed to extract partition spec from Iceberg table {table.name()} due to error: {str(e)}",
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
             )
             return None
 
+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        additional_properties = {}
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        if table.current_snapshot():
+            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
+            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
+        )
+
     def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
         owners = []
         if self.config.user_ownership_property:
@@ -435,22 +484,15 @@
             )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(
-        self, dataset_urn: str
-    ) -> Optional[MetadataWorkUnit]:
-        # If we are a platform instance based source, emit the instance aspect
-        if self.config.platform_instance:
-            return MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -479,6 +521,17 @@
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        yield self._get_dataplatform_instance_aspect()
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Iterable, Union, cast
+from typing import Any, Callable, Dict, Iterable, Optional, cast
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.schema import Schema
@@ -24,8 +24,6 @@ from pyiceberg.utils.datetime import (
 )
 
 from datahub.emitter.mce_builder import get_sys_time
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergProfilingConfig,
     IcebergSourceReport,
@@ -33,6 +31,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -86,9 +85,8 @@ class IcebergProfiler:
     def profile_table(
         self,
         dataset_name: str,
-        dataset_urn: str,
         table: Table,
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[_Aspect]:
         """This method will profile the supplied Iceberg table by looking at the table's manifest.
 
         The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +165,11 @@
                         )
                     total_count += data_file.record_count
             except Exception as e:
-                # Catch any errors that arise from attempting to read the Iceberg table's manifests
-                # This will prevent stateful ingestion from being blocked by an error (profiling is not critical)
-                self.report.report_warning(
-                    "profiling",
-                    f"Error while profiling dataset {dataset_name}: {e}",
+                self.report.warning(
+                    title="Error when profiling a table",
+                    message="Skipping profiling of the table due to errors",
+                    context=dataset_name,
+                    exc=e,
                 )
         if row_count:
             # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +209,11 @@
            f"Finished profiling of dataset: {dataset_name} in {time_taken}"
        )
 
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=dataset_profile,
-        ).as_workunit()
+        yield dataset_profile
 
     def _render_value(
         self, dataset_name: str, value_type: IcebergType, value: Any
-    ) -> Union[str, None]:
+    ) -> Optional[str]:
         try:
             if isinstance(value_type, TimestampType):
                 return to_human_timestamp(value)
@@ -230,9 +225,17 @@
                 return to_human_time(value)
             return str(value)
         except Exception as e:
-            self.report.report_warning(
-                "profiling",
-                f"Error in dataset {dataset_name} when profiling a {value_type} field with value {value}: {e}",
+            self.report.warning(
+                title="Couldn't render value when profiling a table",
+                message="Encountered error, when trying to redner a value for table profile.",
+                context=str(
+                    {
+                        "value": value,
+                        "value_type": value_type,
+                        "dataset_name": dataset_name,
+                    }
+                ),
+                exc=e,
             )
             return None
 
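
Across both files, the old report_warning("key", "message") plus LOGGER calls are replaced by the structured SourceReport API, which carries a title, a human-readable message, the offending context, and the exception in one call. A sketch of that call shape follows, mirroring the keyword arguments used in the diff; the standalone report instance and the triggering failure are illustrative.

# Sketch of the structured reporting call shape used throughout this diff.
from datahub.ingestion.api.source import SourceReport

report = SourceReport()
try:
    raise FileNotFoundError("manifest file missing")  # stand-in for a real ingestion failure
except FileNotFoundError as e:
    # One structured entry replaces the previous report_warning(key, message) + LOGGER.warning pair.
    report.warning(
        title="Manifest file not found",
        message="Couldn't find manifest file to read for the table, skipping it.",
        context="demo_namespace.demo_table",  # hypothetical table name
        exc=e,
    )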