acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (63)
  1. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/METADATA +2693 -2630
  2. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/RECORD +63 -55
  3. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +45 -5
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/common/subtypes.py +3 -0
  11. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  12. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  13. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  14. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  15. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  16. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  17. datahub/ingestion/source/excel/__init__.py +0 -0
  18. datahub/ingestion/source/excel/config.py +92 -0
  19. datahub/ingestion/source/excel/excel_file.py +539 -0
  20. datahub/ingestion/source/excel/profiling.py +308 -0
  21. datahub/ingestion/source/excel/report.py +49 -0
  22. datahub/ingestion/source/excel/source.py +662 -0
  23. datahub/ingestion/source/excel/util.py +18 -0
  24. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  25. datahub/ingestion/source/openapi.py +1 -1
  26. datahub/ingestion/source/powerbi/config.py +33 -0
  27. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  28. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  29. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  30. datahub/ingestion/source/s3/source.py +65 -59
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  33. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  34. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
  37. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  39. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  40. datahub/ingestion/source/sql_queries.py +24 -2
  41. datahub/ingestion/source/state/checkpoint.py +3 -28
  42. datahub/metadata/_internal_schema_classes.py +568 -512
  43. datahub/metadata/_urns/urn_defs.py +1748 -1748
  44. datahub/metadata/schema.avsc +18242 -18168
  45. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  46. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  47. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  48. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  49. datahub/metadata/schemas/Ownership.avsc +69 -0
  50. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  51. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  52. datahub/metadata/schemas/__init__.py +3 -3
  53. datahub/sdk/lineage_client.py +6 -26
  54. datahub/sdk/main_client.py +7 -3
  55. datahub/sdk/search_filters.py +16 -0
  56. datahub/specific/aspect_helpers/siblings.py +73 -0
  57. datahub/specific/dataset.py +2 -0
  58. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  59. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  60. datahub/upgrade/upgrade.py +14 -2
  61. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/WHEEL +0 -0
  62. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/licenses/LICENSE +0 -0
  63. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -3,7 +3,7 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
 
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -239,12 +239,16 @@ class _SnowflakeTagCache:
 
 class SnowflakeDataDictionary(SupportsAsObj):
     def __init__(
-        self, connection: SnowflakeConnection, report: SnowflakeV2Report
+        self,
+        connection: SnowflakeConnection,
+        report: SnowflakeV2Report,
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.connection = connection
         self.report = report
+        self._fetch_views_from_information_schema = fetch_views_from_information_schema
 
-    def as_obj(self) -> Dict[str, Dict[str, int]]:
+    def as_obj(self) -> Dict[str, Any]:
         # TODO: Move this into a proper report type that gets computed.
 
         # Reports how many times we reset in-memory `functools.lru_cache` caches of data,
@@ -260,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_fk_constraints_for_schema,
         ]
 
-        report = {}
+        report: Dict[str, Any] = {
+            "fetch_views_from_information_schema": self._fetch_views_from_information_schema,
+        }
         for func in lru_cache_functions:
             report[func.__name__] = func.cache_info()._asdict()  # type: ignore
         return report
@@ -430,7 +436,17 @@ class SnowflakeDataDictionary(SupportsAsObj):
         return tables
 
     @serialized_lru_cache(maxsize=1)
-    def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]:
+    def get_views_for_database(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        if self._fetch_views_from_information_schema:
+            return self._get_views_for_database_using_information_schema(db_name)
+        else:
+            return self._get_views_for_database_using_show(db_name)
+
+    def _get_views_for_database_using_show(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeView]]:
         page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         views: Dict[str, List[SnowflakeView]] = {}
@@ -461,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
                SnowflakeView(
                    name=view_name,
                    created=view["created_on"],
-                   # last_altered=table["last_altered"],
                    comment=view["comment"],
                    view_definition=view["text"],
-                   last_altered=view["created_on"],
+                   last_altered=view["created_on"],  # TODO: This is not correct.
                    materialized=(
                        view.get("is_materialized", "false").lower() == "true"
                    ),
@@ -479,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
                )
                view_pagination_marker = view_name
 
+        # Because this is in a cached function, this will only log once per database.
+        view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
+        logger.info(
+            f"Finished fetching views in {db_name}; counts by schema {view_counts}"
+        )
+        return views
+
+    def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
+        schema_name = row["VIEW_SCHEMA"]
+        view_definition = row.get("VIEW_DEFINITION")
+        fragment_view_definition = (
+            view_definition[:50].strip() if view_definition else None
+        )
+        logger.info(
+            f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
+        )
+
+        return schema_name, SnowflakeView(
+            name=row["VIEW_NAME"],
+            created=row["CREATED"],
+            comment=row["COMMENT"],
+            view_definition=view_definition,
+            last_altered=row["LAST_ALTERED"],
+            is_secure=(row.get("IS_SECURE", "false").lower() == "true"),
+            # TODO: This doesn't work for materialized views.
+            materialized=False,
+        )
+
+    def _maybe_populate_empty_view_definitions(
+        self,
+        db_name: str,
+        schema_name: str,
+        views_with_empty_definition: List[SnowflakeView],
+    ) -> List[SnowflakeView]:
+        if not views_with_empty_definition:
+            return []
+
+        view_names = [view.name for view in views_with_empty_definition]
+        batches = [
+            batch[0]
+            for batch in build_prefix_batches(
+                view_names, max_batch_size=1000, max_groups_in_batch=1
+            )
+            if batch
+            # Skip empty batch if so, also max_groups_in_batch=1 makes it safe to access batch[0]
+        ]
+
+        view_map: Dict[str, SnowflakeView] = {
+            view.name: view for view in views_with_empty_definition
+        }
+        views_found_count = 0
+
+        logger.info(
+            f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
+            f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
+        )
+
+        for batch_index, prefix_group in enumerate(batches):
+            query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
+            logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")
+
+            try:
+                cur = self.connection.query(query)
+                for row in cur:
+                    view_name = row["name"]
+                    if view_name in view_map:
+                        view_definition = row.get("text")
+                        if view_definition:  # Ensure definition is not None or empty
+                            view_map[view_name].view_definition = view_definition
+                            views_found_count += 1
+                            logger.debug(
+                                f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
+                            )
+                            # If all targeted views are found, we could theoretically break early,
+                            # but SHOW VIEWS doesn't guarantee order, so we must process all results.
+                        else:
+                            logger.warning(
+                                f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
+                            )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
+                    exc_info=e,
+                )
+                # Returning the original list; some views might still be missing definitions.
+                # This also means subsequent batches for this schema (in this call) are skipped.
+                return views_with_empty_definition
+
+        logger.info(
+            f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
+            f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
+        )
+
+        if views_found_count < len(view_map):
+            missing_count = len(view_map) - views_found_count
+            logger.warning(
+                f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
+            )
+        # The SnowflakeView objects in the original list were modified in place via view_map
+        return views_with_empty_definition
+
+    def _get_views_for_database_using_information_schema(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_views_for_database(db_name),
+            )
+        except Exception as e:
+            logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
+            # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
+            return None
+
+        views: Dict[str, List[SnowflakeView]] = {}
+        views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.setdefault(schema_name, []).append(view)
+            else:
+                views.setdefault(schema_name, []).append(view)
+
+        for schema_name, empty_views in views_with_empty_definition.items():
+            updated_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, empty_views
+            )
+            views.setdefault(schema_name, []).extend(updated_views)
+
+        return views
+
+    def get_views_for_schema_using_information_schema(
+        self, *, schema_name: str, db_name: str
+    ) -> List[SnowflakeView]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_views_for_schema(
+                db_name=db_name, schema_name=schema_name
+            ),
+        )
+
+        views: List[SnowflakeView] = []
+        views_with_empty_definition: List[SnowflakeView] = []
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.append(view)
+            else:
+                views.append(view)
+
+        if views_with_empty_definition:
+            updated_empty_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, views_with_empty_definition
+            )
+            views.extend(updated_empty_views)
+
         return views
 
     @serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -166,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def __init__(
         self,
-        config: SnowflakeV2Config,
-        report: SnowflakeV2Report,
+        config: SnowflakeV2Config,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
+        report: SnowflakeV2Report,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
         connection: SnowflakeConnection,
         filters: SnowflakeFilter,
         identifiers: SnowflakeIdentifierBuilder,
@@ -175,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         profiler: Optional[SnowflakeProfiler],
         aggregator: Optional[SqlParsingAggregator],
         snowsight_url_builder: Optional[SnowsightUrlBuilder],
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.config: SnowflakeV2Config = config
         self.report: SnowflakeV2Report = report
@@ -183,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers
 
         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection, report=self.report
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=fetch_views_from_information_schema,
         )
         self.report.data_dictionary_cache = self.data_dictionary
 
@@ -1241,7 +1244,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         # falling back to get tables for schema
         if tables is None:
            self.report.num_get_tables_for_schema_queries += 1
-            return self.data_dictionary.get_tables_for_schema(schema_name, db_name)
+            return self.data_dictionary.get_tables_for_schema(
+                db_name=db_name,
+                schema_name=schema_name,
+            )
 
         # Some schema may not have any table
         return tables.get(schema_name, [])
@@ -1251,8 +1257,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> List[SnowflakeView]:
         views = self.data_dictionary.get_views_for_database(db_name)
 
-        # Some schema may not have any table
-        return views.get(schema_name, [])
+        if views is not None:
+            # Some schemas may not have any views
+            return views.get(schema_name, [])
+
+        # Usually this fails when there are too many views in the schema.
+        # Fall back to per-schema queries.
+        self.report.num_get_views_for_schema_queries += 1
+        return self.data_dictionary.get_views_for_schema_using_information_schema(
+            db_name=db_name,
+            schema_name=schema_name,
+        )
 
     def get_columns_for_table(
         self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str

datahub/ingestion/source/snowflake/snowflake_summary.py

@@ -86,6 +86,7 @@ class SnowflakeSummarySource(Source):
                 filter_config=self.config,
                 structured_reporter=self.report,
             ),
+            fetch_views_from_information_schema=False,  # we haven't enabled this config for SnowflakeSummarySource
         )
 
         # Databases.

datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -172,7 +172,9 @@ class SnowflakeV2Source(
 
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(
-            connection=self.connection, report=self.report
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
@@ -528,6 +530,7 @@ class SnowflakeV2Source(
             snowsight_url_builder=snowsight_url_builder,
             filters=self.filters,
             identifiers=self.identifiers,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
 
         with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
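
Note on the new Snowflake option: the `fetch_views_from_information_schema` flag (added to the config in `snowflake_config.py`) is threaded through `SnowflakeV2Source` and `SnowflakeSchemaGenerator` into `SnowflakeDataDictionary`, where it switches view enumeration from paginated `SHOW VIEWS` commands to a single `information_schema` query per database, with a per-schema fallback when that query returns too much data. A minimal sketch of enabling it from Python follows, assuming the flag is exposed on the Snowflake recipe exactly as added in this release; the connection values are placeholders.

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: enables information_schema-based view fetching on the snowflake source.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "example_account",  # placeholder
                "username": "example_user",  # placeholder
                "password": "example_password",  # placeholder
                "warehouse": "COMPUTE_WH",  # placeholder
                # New in 1.2.0.7rc1: fetch views via information_schema
                # instead of paginated SHOW VIEWS commands.
                "fetch_views_from_information_schema": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()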

datahub/ingestion/source/sql/hive_metastore.py

@@ -174,6 +174,7 @@ class HiveMetastore(BasicSQLAlchemyConfig):
     "Enabled by default",
     subtype_modifier=[
         SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
     ],
 )
 class HiveMetastoreSource(SQLAlchemySource):

datahub/ingestion/source/sql_queries.py

@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -48,7 +52,9 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -109,6 +115,16 @@ class SqlQueriesSource(Source):
         used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
         used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
     schema_resolver: Optional[SchemaResolver]
@@ -165,7 +181,13 @@ class SqlQueriesSource(Source):
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [partial(auto_workunit_reporter, self.get_report())]
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
     def get_workunits_internal(
         self,
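
The sql_queries changes above wire the generic incremental-lineage helper into the source: the config gains `incremental_lineage` via `IncrementalLineageConfigMixin`, and `auto_incremental_lineage` is registered as a workunit processor so UpstreamLineage aspects are emitted as patches. Below is a hedged sketch of how this might be used, assuming the source is registered under the `sql-queries` entry point and that the query file is newline-delimited JSON as described in the docstring; paths, platform, and query text are placeholders.

import json

from datahub.ingestion.run.pipeline import Pipeline

# Write a one-line query file; per the docstring above, fields other than `query`
# (timestamp, user, downstream_tables, upstream_tables) are optional.
with open("/tmp/queries.json", "w") as f:
    f.write(
        json.dumps({"query": "INSERT INTO sales.summary SELECT * FROM sales.raw"}) + "\n"
    )

pipeline = Pipeline.create(
    {
        "source": {
            "type": "sql-queries",
            "config": {
                "query_file": "/tmp/queries.json",
                "platform": "snowflake",  # dialect for the SQL parser; placeholder
                # New: emit UpstreamLineage as patches instead of full overwrites.
                "incremental_lineage": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()

In practice this source is usually pointed at a live DataHub instance so the SQL parser can resolve schemas; the console sink here only keeps the sketch self-contained.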

datahub/ingestion/source/state/checkpoint.py

@@ -1,10 +1,8 @@
 import base64
 import bz2
-import contextlib
 import functools
 import json
 import logging
-import pickle
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Callable, Generic, Optional, Type, TypeVar
@@ -117,10 +115,9 @@ class Checkpoint(Generic[StateType]):
                 checkpoint_aspect, state_class
             )
         elif checkpoint_aspect.state.serde == "base85":
-            state_obj = Checkpoint._from_base85_bytes(
-                checkpoint_aspect,
-                functools.partial(bz2.decompress),
-                state_class,
+            raise ValueError(
+                "The base85 encoding for stateful ingestion has been removed for security reasons. "
+                "You may need to temporarily set `ignore_previous_checkpoint` to true to ignore the outdated checkpoint object."
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -164,28 +161,6 @@ class Checkpoint(Generic[StateType]):
         state_as_dict["serde"] = checkpoint_aspect.state.serde
         return state_class.parse_obj(state_as_dict)
 
-    @staticmethod
-    def _from_base85_bytes(
-        checkpoint_aspect: DatahubIngestionCheckpointClass,
-        decompressor: Callable[[bytes], bytes],
-        state_class: Type[StateType],
-    ) -> StateType:
-        state: StateType = pickle.loads(
-            decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
-        )
-
-        with contextlib.suppress(Exception):
-            # When loading from pickle, the pydantic validators don't run.
-            # By re-serializing and re-parsing, we ensure that the state is valid.
-            # However, we also suppress any exceptions to make sure this doesn't blow up.
-            state = state_class.parse_obj(state.dict())
-
-        # Because the base85 method is deprecated in favor of base85-bz2-json,
-        # we will automatically switch the serde.
-        state.serde = "base85-bz2-json"
-
-        return state
-
     @staticmethod
     def _from_base85_json_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,
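
The checkpoint change above removes the pickle-based `base85` serde entirely; loading an old checkpoint written with it now raises a `ValueError` that points at `ignore_previous_checkpoint`. A hedged sketch of the relevant recipe fragment for sources that support stateful ingestion (the source type and connection details are placeholders):

# Sketch only: skip an outdated base85 checkpoint once, then drop the flag so the
# newly written base85-bz2-json checkpoints are picked up on subsequent runs.
recipe_fragment = {
    "source": {
        "type": "snowflake",  # placeholder: any source with stateful ingestion support
        "config": {
            # ... connection settings elided ...
            "stateful_ingestion": {
                "enabled": True,
                "ignore_previous_checkpoint": True,
            },
        },
    },
}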