acryl-datahub 1.2.0.8rc1__py3-none-any.whl → 1.2.0.8rc2__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.2.0.8rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.2.0.8rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=Rdij3ffZjrkKXarGFXcv2MZfNld3LEFCYwjv7W_kgqg,323
4
+ datahub/_version.py,sha256=61ZxWUlQVKM0CF2BBOi-9OpFZENqh_B4oxFCZYQSJBc,323
5
5
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -366,7 +366,7 @@ datahub/ingestion/source/hex/mapper.py,sha256=IyDAE-TzZUji3ICI_9gkYC3dQN3gl6kERR
366
366
  datahub/ingestion/source/hex/model.py,sha256=eri4aRo1eXcE2SWjzCnPFMhzPTiJ8w8zC4GN7Lgpr74,1864
367
367
  datahub/ingestion/source/hex/query_fetcher.py,sha256=r9UvF_qwswkRlNY7AI8p46eqAYSxVtjVE2e7eO4XagA,13384
368
368
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
369
- datahub/ingestion/source/iceberg/iceberg.py,sha256=UWfI4sN5uO6f9KzxjY939a_BIkAnPf0ELCmFvf9KuYg,35427
369
+ datahub/ingestion/source/iceberg/iceberg.py,sha256=2E3mhvsIDSHDUd1Prb0nlZnGIsQLIuwNeFRxJPYyS-0,37042
370
370
  datahub/ingestion/source/iceberg/iceberg_common.py,sha256=CD_yHQ_wEgivyLQUTRO9BZJB29S7j5fUVllki-BPwUU,12292
371
371
  datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=9iwp2vpQTi4OMbIKoDZV5lAdvjMR0ls6Llpck9grJIE,9875
372
372
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -1114,8 +1114,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1114
1114
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1115
1115
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1116
1116
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1117
- acryl_datahub-1.2.0.8rc1.dist-info/METADATA,sha256=_TXC2AAKI66LHx6fTnBdVxsJBwedMdTKiIhnrBVSTQk,186651
1118
- acryl_datahub-1.2.0.8rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1119
- acryl_datahub-1.2.0.8rc1.dist-info/entry_points.txt,sha256=qopCAD6qrsijaZ9mTw3UlPCKsE00C3t9MbkkWow7pi4,9943
1120
- acryl_datahub-1.2.0.8rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1121
- acryl_datahub-1.2.0.8rc1.dist-info/RECORD,,
1117
+ acryl_datahub-1.2.0.8rc2.dist-info/METADATA,sha256=Q8mmqp92zb_C5PbYaI7zQiAwkw9QrX0FUiCAGxtbzzg,186651
1118
+ acryl_datahub-1.2.0.8rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1119
+ acryl_datahub-1.2.0.8rc2.dist-info/entry_points.txt,sha256=qopCAD6qrsijaZ9mTw3UlPCKsE00C3t9MbkkWow7pi4,9943
1120
+ acryl_datahub-1.2.0.8rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1121
+ acryl_datahub-1.2.0.8rc2.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.2.0.8rc1"
3
+ __version__ = "1.2.0.8rc2"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -12,7 +12,7 @@ from pyiceberg.exceptions import (
12
12
  NoSuchNamespaceError,
13
13
  NoSuchPropertyException,
14
14
  NoSuchTableError,
15
- ServerError,
15
+ RESTError,
16
16
  )
17
17
  from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
18
18
  from pyiceberg.table import Table
@@ -154,6 +154,10 @@ class IcebergSource(StatefulIngestionSourceBase):
154
154
  self.report: IcebergSourceReport = IcebergSourceReport()
155
155
  self.config: IcebergSourceConfig = config
156
156
  self.ctx: PipelineContext = ctx
157
+ self.stamping_processor = AutoSystemMetadata(
158
+ self.ctx
159
+ ) # single instance used only when processing namespaces
160
+ self.namespaces: List[Tuple[Identifier, str]] = []
157
161
 
158
162
  @classmethod
159
163
  def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -246,6 +250,13 @@ class IcebergSource(StatefulIngestionSourceBase):
246
250
  context=str(namespace),
247
251
  exc=e,
248
252
  )
253
+ except RESTError as e:
254
+ self.report.warning(
255
+ title="Iceberg REST Server Error",
256
+ message="Iceberg REST Server returned error status when trying to list tables for a namespace, skipping it.",
257
+ context=str(namespace),
258
+ exc=e,
259
+ )
249
260
  except Exception as e:
250
261
  self.report.report_failure(
251
262
  title="Error when processing a namespace",
@@ -322,10 +333,10 @@ class IcebergSource(StatefulIngestionSourceBase):
322
333
  context=dataset_name,
323
334
  exc=e,
324
335
  )
325
- except ServerError as e:
336
+ except RESTError as e:
326
337
  self.report.warning(
327
338
  title="Iceberg REST Server Error",
328
- message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
339
+ message="Iceberg REST Server returned error status when trying to process a table, skipping it.",
329
340
  context=dataset_name,
330
341
  exc=e,
331
342
  )
@@ -365,7 +376,7 @@ class IcebergSource(StatefulIngestionSourceBase):
365
376
  )
366
377
 
367
378
  try:
368
- catalog = self.config.get_catalog()
379
+ self.catalog = self.config.get_catalog()
369
380
  except Exception as e:
370
381
  self.report.report_failure(
371
382
  title="Failed to initialize catalog object",
@@ -375,33 +386,7 @@ class IcebergSource(StatefulIngestionSourceBase):
375
386
  return
376
387
 
377
388
  try:
378
- stamping_processor = AutoSystemMetadata(self.ctx)
379
- namespace_ids = self._get_namespaces(catalog)
380
- namespaces: List[Tuple[Identifier, str]] = []
381
- for namespace in namespace_ids:
382
- namespace_repr = ".".join(namespace)
383
- LOGGER.debug(f"Processing namespace {namespace_repr}")
384
- namespace_urn = make_container_urn(
385
- NamespaceKey(
386
- namespace=namespace_repr,
387
- platform=self.platform,
388
- instance=self.config.platform_instance,
389
- env=self.config.env,
390
- )
391
- )
392
- namespace_properties: Properties = catalog.load_namespace_properties(
393
- namespace
394
- )
395
- namespaces.append((namespace, namespace_urn))
396
- for aspect in self._create_iceberg_namespace_aspects(
397
- namespace, namespace_properties
398
- ):
399
- yield stamping_processor.stamp_wu(
400
- MetadataChangeProposalWrapper(
401
- entityUrn=namespace_urn, aspect=aspect
402
- ).as_workunit()
403
- )
404
- LOGGER.debug("Namespaces ingestion completed")
389
+ yield from self._process_namespaces()
405
390
  except Exception as e:
406
391
  self.report.report_failure(
407
392
  title="Failed to list namespaces",
@@ -415,13 +400,70 @@ class IcebergSource(StatefulIngestionSourceBase):
415
400
  args_list=[
416
401
  (dataset_path, namespace_urn)
417
402
  for dataset_path, namespace_urn in self._get_datasets(
418
- catalog, namespaces
403
+ self.catalog, self.namespaces
419
404
  )
420
405
  ],
421
406
  max_workers=self.config.processing_threads,
422
407
  ):
423
408
  yield wu
424
409
 
410
+ def _try_processing_namespace(
411
+ self, namespace: Identifier
412
+ ) -> Iterable[MetadataWorkUnit]:
413
+ namespace_repr = ".".join(namespace)
414
+ try:
415
+ LOGGER.debug(f"Processing namespace {namespace_repr}")
416
+ namespace_urn = make_container_urn(
417
+ NamespaceKey(
418
+ namespace=namespace_repr,
419
+ platform=self.platform,
420
+ instance=self.config.platform_instance,
421
+ env=self.config.env,
422
+ )
423
+ )
424
+
425
+ namespace_properties: Properties = self.catalog.load_namespace_properties(
426
+ namespace
427
+ )
428
+ for aspect in self._create_iceberg_namespace_aspects(
429
+ namespace, namespace_properties
430
+ ):
431
+ yield self.stamping_processor.stamp_wu(
432
+ MetadataChangeProposalWrapper(
433
+ entityUrn=namespace_urn, aspect=aspect
434
+ ).as_workunit()
435
+ )
436
+ self.namespaces.append((namespace, namespace_urn))
437
+ except NoSuchNamespaceError as e:
438
+ self.report.report_warning(
439
+ title="Failed to retrieve namespace properties",
440
+ message="Couldn't find the namespace, was it deleted during the ingestion?",
441
+ context=namespace_repr,
442
+ exc=e,
443
+ )
444
+ return
445
+ except RESTError as e:
446
+ self.report.warning(
447
+ title="Iceberg REST Server Error",
448
+ message="Iceberg REST Server returned error status when trying to retrieve namespace properties, skipping it.",
449
+ context=str(namespace),
450
+ exc=e,
451
+ )
452
+ except Exception as e:
453
+ self.report.report_failure(
454
+ title="Failed to process namespace",
455
+ message="Unhandled exception happened during processing of the namespace",
456
+ context=namespace_repr,
457
+ exc=e,
458
+ )
459
+
460
+ def _process_namespaces(self) -> Iterable[MetadataWorkUnit]:
461
+ namespace_ids = self._get_namespaces(self.catalog)
462
+ for namespace in namespace_ids:
463
+ yield from self._try_processing_namespace(namespace)
464
+
465
+ LOGGER.debug("Namespaces ingestion completed")
466
+
425
467
  def _create_iceberg_table_aspects(
426
468
  self, dataset_name: str, table: Table, namespace_urn: str
427
469
  ) -> Iterable[_Aspect]: