acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/METADATA +2564 -2501
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/RECORD +63 -55
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +45 -5
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/metadata/_internal_schema_classes.py +568 -512
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18242 -18168
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -3,7 +3,7 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple
 
 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -239,12 +239,16 @@ class _SnowflakeTagCache:
 
 class SnowflakeDataDictionary(SupportsAsObj):
     def __init__(
-        self,
+        self,
+        connection: SnowflakeConnection,
+        report: SnowflakeV2Report,
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.connection = connection
         self.report = report
+        self._fetch_views_from_information_schema = fetch_views_from_information_schema
 
-    def as_obj(self) -> Dict[str,
+    def as_obj(self) -> Dict[str, Any]:
         # TODO: Move this into a proper report type that gets computed.
 
         # Reports how many times we reset in-memory `functools.lru_cache` caches of data,
@@ -260,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_fk_constraints_for_schema,
         ]
 
-        report = {
+        report: Dict[str, Any] = {
+            "fetch_views_from_information_schema": self._fetch_views_from_information_schema,
+        }
         for func in lru_cache_functions:
             report[func.__name__] = func.cache_info()._asdict()  # type: ignore
         return report
@@ -430,7 +436,17 @@ class SnowflakeDataDictionary(SupportsAsObj):
         return tables
 
     @serialized_lru_cache(maxsize=1)
-    def get_views_for_database(
+    def get_views_for_database(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        if self._fetch_views_from_information_schema:
+            return self._get_views_for_database_using_information_schema(db_name)
+        else:
+            return self._get_views_for_database_using_show(db_name)
+
+    def _get_views_for_database_using_show(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeView]]:
         page_limit = SHOW_COMMAND_MAX_PAGE_SIZE
 
         views: Dict[str, List[SnowflakeView]] = {}
@@ -461,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     SnowflakeView(
                         name=view_name,
                         created=view["created_on"],
-                        # last_altered=table["last_altered"],
                         comment=view["comment"],
                         view_definition=view["text"],
-                        last_altered=view["created_on"],
+                        last_altered=view["created_on"],  # TODO: This is not correct.
                         materialized=(
                             view.get("is_materialized", "false").lower() == "true"
                         ),
@@ -479,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 )
                 view_pagination_marker = view_name
 
+        # Because this is in a cached function, this will only log once per database.
+        view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
+        logger.info(
+            f"Finished fetching views in {db_name}; counts by schema {view_counts}"
+        )
+        return views
+
+    def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
+        schema_name = row["VIEW_SCHEMA"]
+        view_definition = row.get("VIEW_DEFINITION")
+        fragment_view_definition = (
+            view_definition[:50].strip() if view_definition else None
+        )
+        logger.info(
+            f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
+        )
+
+        return schema_name, SnowflakeView(
+            name=row["VIEW_NAME"],
+            created=row["CREATED"],
+            comment=row["COMMENT"],
+            view_definition=view_definition,
+            last_altered=row["LAST_ALTERED"],
+            is_secure=(row.get("IS_SECURE", "false").lower() == "true"),
+            # TODO: This doesn't work for materialized views.
+            materialized=False,
+        )
+
+    def _maybe_populate_empty_view_definitions(
+        self,
+        db_name: str,
+        schema_name: str,
+        views_with_empty_definition: List[SnowflakeView],
+    ) -> List[SnowflakeView]:
+        if not views_with_empty_definition:
+            return []
+
+        view_names = [view.name for view in views_with_empty_definition]
+        batches = [
+            batch[0]
+            for batch in build_prefix_batches(
+                view_names, max_batch_size=1000, max_groups_in_batch=1
+            )
+            if batch
+            # Skip empty batch if so, also max_groups_in_batch=1 makes it safe to access batch[0]
+        ]
+
+        view_map: Dict[str, SnowflakeView] = {
+            view.name: view for view in views_with_empty_definition
+        }
+        views_found_count = 0
+
+        logger.info(
+            f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
+            f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
+        )
+
+        for batch_index, prefix_group in enumerate(batches):
+            query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
+            logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")
+
+            try:
+                cur = self.connection.query(query)
+                for row in cur:
+                    view_name = row["name"]
+                    if view_name in view_map:
+                        view_definition = row.get("text")
+                        if view_definition:  # Ensure definition is not None or empty
+                            view_map[view_name].view_definition = view_definition
+                            views_found_count += 1
+                            logger.debug(
+                                f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
+                            )
+                            # If all targeted views are found, we could theoretically break early,
+                            # but SHOW VIEWS doesn't guarantee order, so we must process all results.
+                        else:
+                            logger.warning(
+                                f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
+                            )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
+                    exc_info=e,
+                )
+                # Returning the original list; some views might still be missing definitions.
+                # This also means subsequent batches for this schema (in this call) are skipped.
+                return views_with_empty_definition
+
+        logger.info(
+            f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
+            f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
+        )
+
+        if views_found_count < len(view_map):
+            missing_count = len(view_map) - views_found_count
+            logger.warning(
+                f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
+            )
+        # The SnowflakeView objects in the original list were modified in place via view_map
+        return views_with_empty_definition
+
+    def _get_views_for_database_using_information_schema(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_views_for_database(db_name),
+            )
+        except Exception as e:
+            logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
+            # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
+            return None
+
+        views: Dict[str, List[SnowflakeView]] = {}
+        views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.setdefault(schema_name, []).append(view)
+            else:
+                views.setdefault(schema_name, []).append(view)
+
+        for schema_name, empty_views in views_with_empty_definition.items():
+            updated_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, empty_views
+            )
+            views.setdefault(schema_name, []).extend(updated_views)
+
+        return views
+
+    def get_views_for_schema_using_information_schema(
+        self, *, schema_name: str, db_name: str
+    ) -> List[SnowflakeView]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_views_for_schema(
+                db_name=db_name, schema_name=schema_name
+            ),
+        )
+
+        views: List[SnowflakeView] = []
+        views_with_empty_definition: List[SnowflakeView] = []
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.append(view)
+            else:
+                views.append(view)
+
+        if views_with_empty_definition:
+            updated_empty_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, views_with_empty_definition
+            )
+            views.extend(updated_empty_views)
+
         return views
 
     @serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)
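The new `_maybe_populate_empty_view_definitions` helper leans on `build_prefix_batches` to turn a potentially long list of view names into a small set of name prefixes, so each follow-up query is a bounded `SHOW VIEWS LIKE '<prefix>%'` rather than one enormous statement. As a rough illustration of that idea, here is a self-contained toy sketch (this is not DataHub's actual `build_prefix_batches` helper, whose signature and grouping behavior differ; it also assumes the names are distinct):

from typing import Dict, List


def toy_prefix_batches(names: List[str], max_batch_size: int) -> List[str]:
    # Illustrative only: derive LIKE-able prefixes such that each prefix matches
    # at most `max_batch_size` of the given names. DataHub's real helper can also
    # group several prefixes into one batch; this sketch returns one prefix per batch.
    def split(group: List[str], prefix: str) -> List[str]:
        if len(group) <= max_batch_size:
            return [prefix]
        buckets: Dict[str, List[str]] = {}
        for name in group:
            # Refine by the next character after the current prefix.
            next_char = name[len(prefix)] if len(name) > len(prefix) else ""
            buckets.setdefault(next_char, []).append(name)
        prefixes: List[str] = []
        for char, bucket in sorted(buckets.items()):
            prefixes.extend(split(bucket, prefix + char))
        return prefixes

    return split(sorted(names), "")


# toy_prefix_batches(["orders_v1", "orders_v2", "users_daily"], max_batch_size=2)
# -> ["o", "u"], i.e. one SHOW VIEWS LIKE 'o%' query and one LIKE 'u%' query.

With `max_groups_in_batch=1`, as in the diff above, each batch holds exactly one prefix group, which is why the list comprehension can safely take `batch[0]`.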
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -166,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
 
     def __init__(
         self,
-        config: SnowflakeV2Config,
-        report: SnowflakeV2Report,
+        config: SnowflakeV2Config,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
+        report: SnowflakeV2Report,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
         connection: SnowflakeConnection,
         filters: SnowflakeFilter,
         identifiers: SnowflakeIdentifierBuilder,
@@ -175,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         profiler: Optional[SnowflakeProfiler],
         aggregator: Optional[SqlParsingAggregator],
         snowsight_url_builder: Optional[SnowsightUrlBuilder],
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.config: SnowflakeV2Config = config
         self.report: SnowflakeV2Report = report
@@ -183,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers
 
         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection,
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=fetch_views_from_information_schema,
         )
         self.report.data_dictionary_cache = self.data_dictionary
 
@@ -1241,7 +1244,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         # falling back to get tables for schema
         if tables is None:
             self.report.num_get_tables_for_schema_queries += 1
-            return self.data_dictionary.get_tables_for_schema(
+            return self.data_dictionary.get_tables_for_schema(
+                db_name=db_name,
+                schema_name=schema_name,
+            )
 
         # Some schema may not have any table
         return tables.get(schema_name, [])
@@ -1251,8 +1257,17 @@
     ) -> List[SnowflakeView]:
         views = self.data_dictionary.get_views_for_database(db_name)
 
-
-
+        if views is not None:
+            # Some schemas may not have any views
+            return views.get(schema_name, [])
+
+        # Usually this fails when there are too many views in the schema.
+        # Fall back to per-schema queries.
+        self.report.num_get_views_for_schema_queries += 1
+        return self.data_dictionary.get_views_for_schema_using_information_schema(
+            db_name=db_name,
+            schema_name=schema_name,
+        )
 
     def get_columns_for_table(
         self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -172,7 +172,9 @@ class SnowflakeV2Source(
 
         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(
-            connection=self.connection,
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
 
@@ -528,6 +530,7 @@ class SnowflakeV2Source(
             snowsight_url_builder=snowsight_url_builder,
             filters=self.filters,
             identifiers=self.identifiers,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
 
         with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
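The `fetch_views_from_information_schema` option being threaded through here is a new `SnowflakeV2Config` field (see `snowflake_config.py` in the file list above). A hedged sketch of a recipe that opts into it; everything except that flag is a placeholder connection/sink setting rather than something taken from this diff:

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: account_id/username/password and the sink settings are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "my_account",
                "username": "datahub_user",
                "password": "${SNOWFLAKE_PASSWORD}",
                # New in this release: fetch views via information_schema instead of
                # paginated SHOW VIEWS commands.
                "fetch_views_from_information_schema": True,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()

When the flag is left at its default of False, the data dictionary keeps using the paginated SHOW VIEWS path, as shown in the dispatch added to `get_views_for_database` above.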
datahub/ingestion/source/sql_queries.py

@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -48,7 +52,9 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
-class SqlQueriesSourceConfig(
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")
 
     platform: str = Field(
@@ -109,6 +115,16 @@ class SqlQueriesSource(Source):
     used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
     used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """
 
     schema_resolver: Optional[SchemaResolver]
@@ -165,7 +181,13 @@
         return self.report
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]
 
     def get_workunits_internal(
         self,
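Since `SqlQueriesSourceConfig` now mixes in `IncrementalLineageConfigMixin` and wires `auto_incremental_lineage` into the workunit processors, the new behavior is driven by a single `incremental_lineage` config flag. A hedged sketch of a recipe using it (the source type string, file path, and sink settings are assumptions, not taken from this diff):

from datahub.ingestion.run.pipeline import Pipeline

# Sketch only: query_file and platform are placeholders; incremental_lineage is the
# option added via IncrementalLineageConfigMixin.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "sql-queries",
            "config": {
                "query_file": "./queries.json",
                "platform": "snowflake",
                # Emit UpstreamLineage as patches instead of full overwrites.
                "incremental_lineage": True,
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }
)
pipeline.run()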
datahub/ingestion/source/state/checkpoint.py

@@ -1,10 +1,8 @@
 import base64
 import bz2
-import contextlib
 import functools
 import json
 import logging
-import pickle
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Callable, Generic, Optional, Type, TypeVar
@@ -117,10 +115,9 @@ class Checkpoint(Generic[StateType]):
                 checkpoint_aspect, state_class
             )
         elif checkpoint_aspect.state.serde == "base85":
-
-
-
-                state_class,
+            raise ValueError(
+                "The base85 encoding for stateful ingestion has been removed for security reasons. "
+                "You may need to temporarily set `ignore_previous_checkpoint` to true to ignore the outdated checkpoint object."
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -164,28 +161,6 @@
         state_as_dict["serde"] = checkpoint_aspect.state.serde
         return state_class.parse_obj(state_as_dict)
 
-    @staticmethod
-    def _from_base85_bytes(
-        checkpoint_aspect: DatahubIngestionCheckpointClass,
-        decompressor: Callable[[bytes], bytes],
-        state_class: Type[StateType],
-    ) -> StateType:
-        state: StateType = pickle.loads(
-            decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
-        )
-
-        with contextlib.suppress(Exception):
-            # When loading from pickle, the pydantic validators don't run.
-            # By re-serializing and re-parsing, we ensure that the state is valid.
-            # However, we also suppress any exceptions to make sure this doesn't blow up.
-            state = state_class.parse_obj(state.dict())
-
-        # Because the base85 method is deprecated in favor of base85-bz2-json,
-        # we will automatically switch the serde.
-        state.serde = "base85-bz2-json"
-
-        return state
-
     @staticmethod
     def _from_base85_json_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,