acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0

--- a/datahub/ingestion/source/snowflake/snowflake_schema.py
+++ b/datahub/ingestion/source/snowflake/snowflake_schema.py
@@ -3,7 +3,7 @@ import os
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Tuple

 from datahub.ingestion.api.report import SupportsAsObj
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
@@ -239,12 +239,16 @@ class _SnowflakeTagCache:

 class SnowflakeDataDictionary(SupportsAsObj):
     def __init__(
-        self,
+        self,
+        connection: SnowflakeConnection,
+        report: SnowflakeV2Report,
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.connection = connection
         self.report = report
+        self._fetch_views_from_information_schema = fetch_views_from_information_schema

-    def as_obj(self) -> Dict[str,
+    def as_obj(self) -> Dict[str, Any]:
         # TODO: Move this into a proper report type that gets computed.

         # Reports how many times we reset in-memory `functools.lru_cache` caches of data,
@@ -260,7 +264,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
             self.get_fk_constraints_for_schema,
         ]

-        report = {
+        report: Dict[str, Any] = {
+            "fetch_views_from_information_schema": self._fetch_views_from_information_schema,
+        }
         for func in lru_cache_functions:
             report[func.__name__] = func.cache_info()._asdict()  # type: ignore
         return report
@@ -430,7 +436,17 @@ class SnowflakeDataDictionary(SupportsAsObj):
         return tables

     @serialized_lru_cache(maxsize=1)
-    def get_views_for_database(
+    def get_views_for_database(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        if self._fetch_views_from_information_schema:
+            return self._get_views_for_database_using_information_schema(db_name)
+        else:
+            return self._get_views_for_database_using_show(db_name)
+
+    def _get_views_for_database_using_show(
+        self, db_name: str
+    ) -> Dict[str, List[SnowflakeView]]:
         page_limit = SHOW_COMMAND_MAX_PAGE_SIZE

         views: Dict[str, List[SnowflakeView]] = {}
@@ -461,10 +477,9 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     SnowflakeView(
                         name=view_name,
                         created=view["created_on"],
-                        # last_altered=table["last_altered"],
                         comment=view["comment"],
                         view_definition=view["text"],
-                        last_altered=view["created_on"],
+                        last_altered=view["created_on"],  # TODO: This is not correct.
                         materialized=(
                             view.get("is_materialized", "false").lower() == "true"
                         ),
@@ -479,6 +494,163 @@ class SnowflakeDataDictionary(SupportsAsObj):
                 )
                 view_pagination_marker = view_name

+        # Because this is in a cached function, this will only log once per database.
+        view_counts = {schema_name: len(views[schema_name]) for schema_name in views}
+        logger.info(
+            f"Finished fetching views in {db_name}; counts by schema {view_counts}"
+        )
+        return views
+
+    def _map_view(self, db_name: str, row: Dict[str, Any]) -> Tuple[str, SnowflakeView]:
+        schema_name = row["VIEW_SCHEMA"]
+        view_definition = row.get("VIEW_DEFINITION")
+        fragment_view_definition = (
+            view_definition[:50].strip() if view_definition else None
+        )
+        logger.info(
+            f"Mapping view {db_name}.{schema_name}.{row['VIEW_NAME']} with view definition: {fragment_view_definition}..."
+        )
+
+        return schema_name, SnowflakeView(
+            name=row["VIEW_NAME"],
+            created=row["CREATED"],
+            comment=row["COMMENT"],
+            view_definition=view_definition,
+            last_altered=row["LAST_ALTERED"],
+            is_secure=(row.get("IS_SECURE", "false").lower() == "true"),
+            # TODO: This doesn't work for materialized views.
+            materialized=False,
+        )
+
+    def _maybe_populate_empty_view_definitions(
+        self,
+        db_name: str,
+        schema_name: str,
+        views_with_empty_definition: List[SnowflakeView],
+    ) -> List[SnowflakeView]:
+        if not views_with_empty_definition:
+            return []
+
+        view_names = [view.name for view in views_with_empty_definition]
+        batches = [
+            batch[0]
+            for batch in build_prefix_batches(
+                view_names, max_batch_size=1000, max_groups_in_batch=1
+            )
+            if batch
+            # Skip empty batch if so, also max_groups_in_batch=1 makes it safe to access batch[0]
+        ]
+
+        view_map: Dict[str, SnowflakeView] = {
+            view.name: view for view in views_with_empty_definition
+        }
+        views_found_count = 0
+
+        logger.info(
+            f"Fetching definitions for {len(view_map)} views in {db_name}.{schema_name} "
+            f"using batched 'SHOW VIEWS ... LIKE ...' queries. Found {len(batches)} batch(es)."
+        )
+
+        for batch_index, prefix_group in enumerate(batches):
+            query = f'SHOW VIEWS LIKE \'{prefix_group.prefix}%\' IN SCHEMA "{db_name}"."{schema_name}"'
+            logger.info(f"Processing batch {batch_index + 1}/{len(batches)}: {query}")
+
+            try:
+                cur = self.connection.query(query)
+                for row in cur:
+                    view_name = row["name"]
+                    if view_name in view_map:
+                        view_definition = row.get("text")
+                        if view_definition:  # Ensure definition is not None or empty
+                            view_map[view_name].view_definition = view_definition
+                            views_found_count += 1
+                            logger.debug(
+                                f"Fetched view definition for {db_name}.{schema_name}.{view_name}"
+                            )
+                            # If all targeted views are found, we could theoretically break early,
+                            # but SHOW VIEWS doesn't guarantee order, so we must process all results.
+                        else:
+                            logger.warning(
+                                f"'text' field missing or empty in SHOW VIEWS result for {db_name}.{schema_name}.{view_name}"
+                            )
+
+            except Exception as e:
+                logger.error(
+                    f"Failed to execute query for batch {batch_index + 1} ('{query}') for {db_name}.{schema_name} or process its results.",
+                    exc_info=e,
+                )
+                # Returning the original list; some views might still be missing definitions.
+                # This also means subsequent batches for this schema (in this call) are skipped.
+                return views_with_empty_definition
+
+        logger.info(
+            f"Finished processing 'SHOW VIEWS' batches for {db_name}.{schema_name}. "
+            f"Fetched definitions for {views_found_count} out of {len(view_map)} targeted views."
+        )
+
+        if views_found_count < len(view_map):
+            missing_count = len(view_map) - views_found_count
+            logger.warning(
+                f"Could not fetch definitions for {missing_count} views in {db_name}.{schema_name} after processing all batches."
+            )
+        # The SnowflakeView objects in the original list were modified in place via view_map
+        return views_with_empty_definition
+
+    def _get_views_for_database_using_information_schema(
+        self, db_name: str
+    ) -> Optional[Dict[str, List[SnowflakeView]]]:
+        try:
+            cur = self.connection.query(
+                SnowflakeQuery.get_views_for_database(db_name),
+            )
+        except Exception as e:
+            logger.debug(f"Failed to get all views for database {db_name}", exc_info=e)
+            # Error - Information schema query returned too much data. Please repeat query with more selective predicates.
+            return None
+
+        views: Dict[str, List[SnowflakeView]] = {}
+        views_with_empty_definition: Dict[str, List[SnowflakeView]] = {}
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.setdefault(schema_name, []).append(view)
+            else:
+                views.setdefault(schema_name, []).append(view)
+
+        for schema_name, empty_views in views_with_empty_definition.items():
+            updated_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, empty_views
+            )
+            views.setdefault(schema_name, []).extend(updated_views)
+
+        return views
+
+    def get_views_for_schema_using_information_schema(
+        self, *, schema_name: str, db_name: str
+    ) -> List[SnowflakeView]:
+        cur = self.connection.query(
+            SnowflakeQuery.get_views_for_schema(
+                db_name=db_name, schema_name=schema_name
+            ),
+        )
+
+        views: List[SnowflakeView] = []
+        views_with_empty_definition: List[SnowflakeView] = []
+
+        for row in cur:
+            schema_name, view = self._map_view(db_name, row)
+            if view.view_definition is None or view.view_definition == "":
+                views_with_empty_definition.append(view)
+            else:
+                views.append(view)
+
+        if views_with_empty_definition:
+            updated_empty_views = self._maybe_populate_empty_view_definitions(
+                db_name, schema_name, views_with_empty_definition
+            )
+            views.extend(updated_empty_views)
+
         return views

     @serialized_lru_cache(maxsize=SCHEMA_PARALLELISM)
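
The new fallback above fetches missing view definitions with batched SHOW VIEWS LIKE '<prefix>%' queries rather than one query per view. A rough, self-contained sketch of the batching idea, using a naive stand-in for the real build_prefix_batches helper and hypothetical database, schema, and view names:

from typing import Dict, List

def naive_prefix_groups(names: List[str], prefix_len: int = 6) -> Dict[str, List[str]]:
    # Stand-in for build_prefix_batches: group view names by a shared prefix
    # so one SHOW VIEWS LIKE query can cover many views at once.
    groups: Dict[str, List[str]] = {}
    for name in sorted(names):
        groups.setdefault(name[:prefix_len], []).append(name)
    return groups

view_names = ["ORDERS_DAILY_V", "ORDERS_MONTHLY_V", "SALES_SUMMARY_V"]
for prefix in naive_prefix_groups(view_names):
    # Hypothetical identifiers, purely for illustration.
    print(f"SHOW VIEWS LIKE '{prefix}%' IN SCHEMA \"ANALYTICS\".\"PUBLIC\"")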

--- a/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
+++ b/datahub/ingestion/source/snowflake/snowflake_schema_gen.py
@@ -166,8 +166,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):

     def __init__(
         self,
-        config: SnowflakeV2Config,
-        report: SnowflakeV2Report,
+        config: SnowflakeV2Config,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryConfig
+        report: SnowflakeV2Report,  # FIXME: SnowflakeSummary is passing here SnowflakeSummaryReport
         connection: SnowflakeConnection,
         filters: SnowflakeFilter,
         identifiers: SnowflakeIdentifierBuilder,
@@ -175,6 +175,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         profiler: Optional[SnowflakeProfiler],
         aggregator: Optional[SqlParsingAggregator],
         snowsight_url_builder: Optional[SnowsightUrlBuilder],
+        fetch_views_from_information_schema: bool = False,
     ) -> None:
         self.config: SnowflakeV2Config = config
         self.report: SnowflakeV2Report = report
@@ -183,7 +184,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         self.identifiers: SnowflakeIdentifierBuilder = identifiers

         self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary(
-            connection=self.connection,
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=fetch_views_from_information_schema,
         )
         self.report.data_dictionary_cache = self.data_dictionary

@@ -438,13 +441,16 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         tables = self.fetch_tables_for_schema(
             snowflake_schema, db_name, schema_name
         )
+        if self.config.include_views:
+            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
+
+        if self.config.include_tables:
             db_tables[schema_name] = tables
             yield from self._process_tables(
                 tables, snowflake_schema, db_name, schema_name
             )

         if self.config.include_views:
-            views = self.fetch_views_for_schema(snowflake_schema, db_name, schema_name)
             yield from self._process_views(
                 views, snowflake_schema, db_name, schema_name
             )
@@ -1241,7 +1247,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
         # falling back to get tables for schema
         if tables is None:
             self.report.num_get_tables_for_schema_queries += 1
-            return self.data_dictionary.get_tables_for_schema(
+            return self.data_dictionary.get_tables_for_schema(
+                db_name=db_name,
+                schema_name=schema_name,
+            )

         # Some schema may not have any table
         return tables.get(schema_name, [])
@@ -1251,8 +1260,17 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> List[SnowflakeView]:
         views = self.data_dictionary.get_views_for_database(db_name)

-
-
+        if views is not None:
+            # Some schemas may not have any views
+            return views.get(schema_name, [])
+
+        # Usually this fails when there are too many views in the schema.
+        # Fall back to per-schema queries.
+        self.report.num_get_views_for_schema_queries += 1
+        return self.data_dictionary.get_views_for_schema_using_information_schema(
+            db_name=db_name,
+            schema_name=schema_name,
+        )

     def get_columns_for_table(
         self, table_name: str, snowflake_schema: SnowflakeSchema, db_name: str

--- a/datahub/ingestion/source/snowflake/snowflake_utils.py
+++ b/datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -9,6 +9,7 @@ from datahub.emitter.mce_builder import (
 from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.source.snowflake.constants import (
+    DEFAULT_SNOWFLAKE_DOMAIN,
     SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
     SnowflakeCloudProvider,
     SnowflakeObjectDomain,
@@ -34,16 +35,21 @@ class SnowsightUrlBuilder:
         "us-east-1",
         "eu-west-1",
         "eu-central-1",
-        "ap-southeast-1",
         "ap-southeast-2",
     ]

     snowsight_base_url: str

-    def __init__(
+    def __init__(
+        self,
+        account_locator: str,
+        region: str,
+        privatelink: bool = False,
+        snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
+    ):
         cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region)
         self.snowsight_base_url = self.create_snowsight_base_url(
-            account_locator, cloud_region_id, cloud, privatelink
+            account_locator, cloud_region_id, cloud, privatelink, snowflake_domain
         )

     @staticmethod
@@ -52,6 +58,7 @@ class SnowsightUrlBuilder:
         cloud_region_id: str,
         cloud: str,
         privatelink: bool = False,
+        snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN,
     ) -> str:
         if cloud:
             url_cloud_provider_suffix = f".{cloud}"
@@ -67,9 +74,15 @@ class SnowsightUrlBuilder:
         else:
             url_cloud_provider_suffix = f".{cloud}"
         if privatelink:
-            url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.
+            url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.{snowflake_domain}/"
         else:
-
+            # Standard Snowsight URL format - works for most regions
+            # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
+            # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
+            if snowflake_domain == "snowflakecomputing.cn":
+                url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
+            else:
+                url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
         return url

     @staticmethod
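
To make the URL formats above concrete, here is a small sketch with a hypothetical account locator and region; the real cloud/region pair is resolved from SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, and DEFAULT_SNOWFLAKE_DOMAIN is assumed here to be the standard snowflakecomputing.com domain:

# Hypothetical values, purely for illustration of the f-strings above.
account_locator = "xy12345"
cloud, cloud_region_id = "aws", "us-west-2"
url_cloud_provider_suffix = f".{cloud}"
snowflake_domain = "snowflakecomputing.com"  # assumed default

print(f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/")
# https://app.snowflake.com/us-west-2.aws/xy12345/
print(f"https://app.{account_locator}.{cloud_region_id}.privatelink.{snowflake_domain}/")
# https://app.xy12345.us-west-2.privatelink.snowflakecomputing.com/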

--- a/datahub/ingestion/source/snowflake/snowflake_v2.py
+++ b/datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -172,7 +172,9 @@ class SnowflakeV2Source(

         # For database, schema, tables, views, etc
         self.data_dictionary = SnowflakeDataDictionary(
-            connection=self.connection,
+            connection=self.connection,
+            report=self.report,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )
         self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None

@@ -197,6 +199,7 @@ class SnowflakeV2Source(
             ),
             generate_usage_statistics=False,
             generate_operations=False,
+            generate_queries=self.config.include_queries,
             format_queries=self.config.format_sql_queries,
             is_temp_table=self._is_temp_table,
             is_allowed_table=self._is_allowed_table,
@@ -528,6 +531,7 @@ class SnowflakeV2Source(
             snowsight_url_builder=snowsight_url_builder,
             filters=self.filters,
             identifiers=self.identifiers,
+            fetch_views_from_information_schema=self.config.fetch_views_from_information_schema,
         )

         with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
@@ -747,6 +751,7 @@ class SnowflakeV2Source(
                 # For privatelink, account identifier ends with .privatelink
                 # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls
                 privatelink=self.config.account_id.endswith(".privatelink"),
+                snowflake_domain=self.config.snowflake_domain,
             )

         except Exception as e:

--- a/datahub/ingestion/source/sql/mssql/job_models.py
+++ b/datahub/ingestion/source/sql/mssql/job_models.py
@@ -134,7 +134,9 @@ class StoredProcedure:

     @property
     def escape_full_name(self) -> str:
-        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
+        return f"[{self.db}].[{self.schema}].[{self.formatted_name}]".replace(
+            "'", r"''"
+        )

     def to_base_procedure(self) -> BaseProcedure:
         return BaseProcedure(
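
The change above doubles any single quotes in the bracketed procedure name so it can be embedded safely inside a T-SQL string literal; a tiny illustration with hypothetical names:

db, schema, formatted_name = "Sales", "dbo", "load_customer's_orders"  # hypothetical
escaped = f"[{db}].[{schema}].[{formatted_name}]".replace("'", "''")
print(escaped)  # [Sales].[dbo].[load_customer''s_orders]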

--- a/datahub/ingestion/source/sql/mssql/source.py
+++ b/datahub/ingestion/source/sql/mssql/source.py
@@ -10,6 +10,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError
+from sqlalchemy.sql import quoted_name

 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
@@ -130,10 +131,14 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         "match the entire table name in database.schema.table format. Defaults are to set in such a way "
         "to ignore the temporary staging tables created by known ETL tools.",
     )
+    quote_schemas: bool = Field(
+        default=False,
+        description="Represent a schema identifiers combined with quoting preferences. See [sqlalchemy quoted_name docs](https://docs.sqlalchemy.org/en/20/core/sqlelement.html#sqlalchemy.sql.expression.quoted_name).",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
-        if values["use_odbc"] and "driver" not in v:
+        if values["use_odbc"] and not values["sqlalchemy_uri"] and "driver" not in v:
             raise ValueError("uri_args must contain a 'driver' option")
         elif not values["use_odbc"] and v:
             raise ValueError("uri_args is not supported when ODBC is disabled")
@@ -159,7 +164,15 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
             uri_opts=uri_opts,
         )
         if self.use_odbc:
-
+            final_uri_args = self.uri_args.copy()
+            if final_uri_args and current_db:
+                final_uri_args.update({"database": current_db})
+
+            uri = (
+                f"{uri}?{urllib.parse.urlencode(final_uri_args)}"
+                if final_uri_args
+                else uri
+            )
         return uri

     @property
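
For reference, the rewritten ODBC branch above merges the current database into the user-supplied uri_args and URL-encodes them onto the connection URI; a minimal sketch with hypothetical values:

import urllib.parse

uri = "mssql+pyodbc://user:pass@my-dsn"  # hypothetical base URI
final_uri_args = {"driver": "ODBC Driver 18 for SQL Server", "database": "DemoDB"}
print(f"{uri}?{urllib.parse.urlencode(final_uri_args)}")
# mssql+pyodbc://user:pass@my-dsn?driver=ODBC+Driver+18+for+SQL+Server&database=DemoDB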
@@ -923,7 +936,11 @@ class SQLServerSource(SQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)

-        if
+        if (
+            self.config.database
+            and self.config.database != ""
+            or (self.config.sqlalchemy_uri and self.config.sqlalchemy_uri != "")
+        ):
             inspector = inspect(engine)
             yield inspector
         else:
@@ -1020,3 +1037,45 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else table_ref_str
         )
+
+    def get_allowed_schemas(self, inspector: Inspector, db_name: str) -> Iterable[str]:
+        for schema in super().get_allowed_schemas(inspector, db_name):
+            if self.config.quote_schemas:
+                yield quoted_name(schema, True)
+            else:
+                yield schema
+
+    def get_db_name(self, inspector: Inspector) -> str:
+        engine = inspector.engine
+
+        try:
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "database")
+                and engine.url.database
+            ):
+                return str(engine.url.database).strip('"')
+
+            if (
+                engine
+                and hasattr(engine, "url")
+                and hasattr(engine.url, "query")
+                and "odbc_connect" in engine.url.query
+            ):
+                # According to the ODBC connection keywords: https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver17#supported-dsnconnection-string-keywords-and-connection-attributes
+                database = re.search(
+                    r"DATABASE=([^;]*);",
+                    urllib.parse.unquote_plus(str(engine.url.query["odbc_connect"])),
+                    flags=re.IGNORECASE,
+                )
+
+                if database and database.group(1):
+                    return database.group(1)
+
+            return ""
+
+        except Exception as e:
+            raise RuntimeError(
+                "Unable to get database name from Sqlalchemy inspector"
+            ) from e
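
The new get_db_name fallback above pulls the database out of an odbc_connect string with a case-insensitive regex; a short sketch against a hypothetical connection string:

import re
import urllib.parse

odbc_connect = urllib.parse.quote_plus(
    "DRIVER={ODBC Driver 18 for SQL Server};SERVER=myhost;DATABASE=DemoDB;UID=user;PWD=secret"
)
match = re.search(
    r"DATABASE=([^;]*);",
    urllib.parse.unquote_plus(odbc_connect),
    flags=re.IGNORECASE,
)
print(match.group(1) if match else "")  # DemoDB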

--- a/datahub/ingestion/source/sql_queries.py
+++ b/datahub/ingestion/source/sql_queries.py
@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+    auto_incremental_lineage,
+)
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     Source,
@@ -48,7 +52,9 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)


-class SqlQueriesSourceConfig(
+class SqlQueriesSourceConfig(
+    PlatformInstanceConfigMixin, EnvConfigMixin, IncrementalLineageConfigMixin
+):
     query_file: str = Field(description="Path to file to ingest")

     platform: str = Field(
@@ -109,6 +115,16 @@ class SqlQueriesSource(Source):
     used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
     used if the query can't be parsed.
+
+    ### Incremental Lineage
+    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
+    This allows you to add lineage edges without removing existing ones, which is useful for:
+    - Gradually building up lineage from multiple sources
+    - Preserving manually curated lineage
+    - Avoiding conflicts when multiple ingestion processes target the same datasets
+
+    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
+    statistics will still be emitted normally.
     """

     schema_resolver: Optional[SchemaResolver]
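
As described in the docstring addition above, incremental lineage is opt-in via the incremental_lineage flag that SqlQueriesSourceConfig now inherits from IncrementalLineageConfigMixin; a minimal, hypothetical config sketch (other fields keep their defaults, which are not verified here):

from datahub.ingestion.source.sql_queries import SqlQueriesSourceConfig

config = SqlQueriesSourceConfig(
    query_file="./queries.json",  # hypothetical path
    platform="snowflake",
    incremental_lineage=True,     # emit UpstreamLineage as patches instead of overwrites
)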
@@ -165,7 +181,13 @@ class SqlQueriesSource(Source):
         return self.report

     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
-        return [
+        return [
+            partial(auto_workunit_reporter, self.get_report()),
+            partial(
+                auto_incremental_lineage,
+                self.config.incremental_lineage,
+            ),
+        ]

     def get_workunits_internal(
         self,

--- a/datahub/ingestion/source/state/checkpoint.py
+++ b/datahub/ingestion/source/state/checkpoint.py
@@ -1,10 +1,8 @@
 import base64
 import bz2
-import contextlib
 import functools
 import json
 import logging
-import pickle
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from typing import Callable, Generic, Optional, Type, TypeVar
@@ -117,10 +115,9 @@ class Checkpoint(Generic[StateType]):
                 checkpoint_aspect, state_class
             )
         elif checkpoint_aspect.state.serde == "base85":
-
-
-
-                state_class,
+            raise ValueError(
+                "The base85 encoding for stateful ingestion has been removed for security reasons. "
+                "You may need to temporarily set `ignore_previous_checkpoint` to true to ignore the outdated checkpoint object."
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -164,28 +161,6 @@ class Checkpoint(Generic[StateType]):
             state_as_dict["serde"] = checkpoint_aspect.state.serde
             return state_class.parse_obj(state_as_dict)

-    @staticmethod
-    def _from_base85_bytes(
-        checkpoint_aspect: DatahubIngestionCheckpointClass,
-        decompressor: Callable[[bytes], bytes],
-        state_class: Type[StateType],
-    ) -> StateType:
-        state: StateType = pickle.loads(
-            decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
-        )
-
-        with contextlib.suppress(Exception):
-            # When loading from pickle, the pydantic validators don't run.
-            # By re-serializing and re-parsing, we ensure that the state is valid.
-            # However, we also suppress any exceptions to make sure this doesn't blow up.
-            state = state_class.parse_obj(state.dict())
-
-        # Because the base85 method is deprecated in favor of base85-bz2-json,
-        # we will automatically switch the serde.
-        state.serde = "base85-bz2-json"
-
-        return state
-
     @staticmethod
     def _from_base85_json_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,