acryl-datahub 0.15.0.5rc3__py3-none-any.whl → 0.15.0.5rc5__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/METADATA +2480 -2480
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/RECORD +15 -15
- datahub/_version.py +1 -1
- datahub/ingestion/source/aws/glue.py +2 -0
- datahub/ingestion/source/looker/looker_config.py +3 -1
- datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
- datahub/ingestion/source/looker/looker_file_loader.py +14 -3
- datahub/ingestion/source/looker/looker_template_language.py +104 -14
- datahub/ingestion/source/looker/lookml_config.py +20 -3
- datahub/ingestion/source/looker/lookml_source.py +110 -22
- datahub/ingestion/source/snowflake/snowflake_v2.py +41 -4
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/top_level.txt +0 -0
|
@@ -43,6 +43,7 @@ from datahub.ingestion.source.looker.looker_common import (
|
|
|
43
43
|
from datahub.ingestion.source.looker.looker_connection import (
|
|
44
44
|
get_connection_def_based_on_connection_string,
|
|
45
45
|
)
|
|
46
|
+
from datahub.ingestion.source.looker.looker_dataclasses import LookerConstant
|
|
46
47
|
from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
|
|
47
48
|
from datahub.ingestion.source.looker.looker_template_language import (
|
|
48
49
|
load_and_preprocess_file,
|
|
@@ -59,6 +60,7 @@ from datahub.ingestion.source.looker.lookml_concept_context import (
|
|
|
59
60
|
from datahub.ingestion.source.looker.lookml_config import (
|
|
60
61
|
BASE_PROJECT_NAME,
|
|
61
62
|
MODEL_FILE_EXTENSION,
|
|
63
|
+
VIEW_FILE_EXTENSION,
|
|
62
64
|
LookerConnectionDefinition,
|
|
63
65
|
LookMLSourceConfig,
|
|
64
66
|
LookMLSourceReport,
|
|
@@ -253,6 +255,7 @@ class LookerManifest:
|
|
|
253
255
|
# This must be set if the manifest has local_dependency entries.
|
|
254
256
|
# See https://cloud.google.com/looker/docs/reference/param-manifest-project-name
|
|
255
257
|
project_name: Optional[str]
|
|
258
|
+
constants: Optional[List[Dict[str, str]]]
|
|
256
259
|
|
|
257
260
|
local_dependencies: List[str]
|
|
258
261
|
remote_dependencies: List[LookerRemoteDependency]
|
|
@@ -309,11 +312,14 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
309
312
|
"manage_models permission enabled on this API key."
|
|
310
313
|
) from err
|
|
311
314
|
|
|
315
|
+
self.manifest_constants: Dict[str, "LookerConstant"] = {}
|
|
316
|
+
|
|
312
317
|
def _load_model(self, path: str) -> LookerModel:
|
|
313
318
|
logger.debug(f"Loading model from file {path}")
|
|
314
319
|
|
|
315
320
|
parsed = load_and_preprocess_file(
|
|
316
321
|
path=path,
|
|
322
|
+
reporter=self.reporter,
|
|
317
323
|
source_config=self.source_config,
|
|
318
324
|
)
|
|
319
325
|
|
|
@@ -499,27 +505,33 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
499
505
|
|
|
500
506
|
def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
|
|
501
507
|
manifest_file = folder / "manifest.lkml"
|
|
502
|
-
if manifest_file.exists():
|
|
503
|
-
manifest_dict = load_and_preprocess_file(
|
|
504
|
-
path=manifest_file, source_config=self.source_config
|
|
505
|
-
)
|
|
506
508
|
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
],
|
|
512
|
-
remote_dependencies=[
|
|
513
|
-
LookerRemoteDependency(
|
|
514
|
-
name=x["name"], url=x["url"], ref=x.get("ref")
|
|
515
|
-
)
|
|
516
|
-
for x in manifest_dict.get("remote_dependencys", [])
|
|
517
|
-
],
|
|
509
|
+
if not manifest_file.exists():
|
|
510
|
+
self.reporter.info(
|
|
511
|
+
message="manifest.lkml file missing from project",
|
|
512
|
+
context=str(manifest_file),
|
|
518
513
|
)
|
|
519
|
-
return manifest
|
|
520
|
-
else:
|
|
521
514
|
return None
|
|
522
515
|
|
|
516
|
+
manifest_dict = load_and_preprocess_file(
|
|
517
|
+
path=manifest_file,
|
|
518
|
+
source_config=self.source_config,
|
|
519
|
+
reporter=self.reporter,
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
manifest = LookerManifest(
|
|
523
|
+
project_name=manifest_dict.get("project_name"),
|
|
524
|
+
constants=manifest_dict.get("constants", []),
|
|
525
|
+
local_dependencies=[
|
|
526
|
+
x["project"] for x in manifest_dict.get("local_dependencys", [])
|
|
527
|
+
],
|
|
528
|
+
remote_dependencies=[
|
|
529
|
+
LookerRemoteDependency(name=x["name"], url=x["url"], ref=x.get("ref"))
|
|
530
|
+
for x in manifest_dict.get("remote_dependencys", [])
|
|
531
|
+
],
|
|
532
|
+
)
|
|
533
|
+
return manifest
|
|
534
|
+
|
|
523
535
|
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
524
536
|
return [
|
|
525
537
|
*super().get_workunit_processors(),
|
|
@@ -574,7 +586,10 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
574
586
|
self.base_projects_folder[project] = p_ref
|
|
575
587
|
|
|
576
588
|
self._recursively_check_manifests(
|
|
577
|
-
tmp_dir,
|
|
589
|
+
tmp_dir,
|
|
590
|
+
BASE_PROJECT_NAME,
|
|
591
|
+
visited_projects,
|
|
592
|
+
self.manifest_constants,
|
|
578
593
|
)
|
|
579
594
|
|
|
580
595
|
yield from self.get_internal_workunits()
|
|
@@ -587,7 +602,11 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
587
602
|
)
|
|
588
603
|
|
|
589
604
|
def _recursively_check_manifests(
|
|
590
|
-
self,
|
|
605
|
+
self,
|
|
606
|
+
tmp_dir: str,
|
|
607
|
+
project_name: str,
|
|
608
|
+
project_visited: Set[str],
|
|
609
|
+
manifest_constants: Dict[str, "LookerConstant"],
|
|
591
610
|
) -> None:
|
|
592
611
|
if project_name in project_visited:
|
|
593
612
|
return
|
|
@@ -604,6 +623,14 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
604
623
|
if not manifest:
|
|
605
624
|
return
|
|
606
625
|
|
|
626
|
+
if manifest.constants:
|
|
627
|
+
for constant in manifest.constants:
|
|
628
|
+
if constant.get("name") and constant.get("value"):
|
|
629
|
+
manifest_constants[constant["name"]] = LookerConstant(
|
|
630
|
+
name=constant["name"],
|
|
631
|
+
value=constant["value"],
|
|
632
|
+
)
|
|
633
|
+
|
|
607
634
|
# Special case handling if the root project has a name in the manifest file.
|
|
608
635
|
if project_name == BASE_PROJECT_NAME and manifest.project_name:
|
|
609
636
|
if (
|
|
@@ -663,21 +690,27 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
663
690
|
project_visited.add(project_name)
|
|
664
691
|
else:
|
|
665
692
|
self._recursively_check_manifests(
|
|
666
|
-
tmp_dir,
|
|
693
|
+
tmp_dir,
|
|
694
|
+
remote_project.name,
|
|
695
|
+
project_visited,
|
|
696
|
+
manifest_constants,
|
|
667
697
|
)
|
|
668
698
|
|
|
669
699
|
for project in manifest.local_dependencies:
|
|
670
|
-
self._recursively_check_manifests(
|
|
700
|
+
self._recursively_check_manifests(
|
|
701
|
+
tmp_dir, project, project_visited, manifest_constants
|
|
702
|
+
)
|
|
671
703
|
|
|
672
704
|
def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
|
|
673
705
|
assert self.source_config.base_folder
|
|
674
|
-
|
|
675
706
|
viewfile_loader = LookerViewFileLoader(
|
|
676
707
|
self.source_config.project_name,
|
|
677
708
|
self.base_projects_folder,
|
|
678
709
|
self.reporter,
|
|
679
710
|
self.source_config,
|
|
711
|
+
self.manifest_constants,
|
|
680
712
|
)
|
|
713
|
+
logger.debug(f"LookML Constants : {', '.join(self.manifest_constants.keys())}")
|
|
681
714
|
|
|
682
715
|
# Some views can be mentioned by multiple 'include' statements and can be included via different connections.
|
|
683
716
|
|
|
@@ -884,6 +917,7 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
884
917
|
view_urn = maybe_looker_view.id.get_urn(
|
|
885
918
|
self.source_config
|
|
886
919
|
)
|
|
920
|
+
|
|
887
921
|
view_connection_mapping = view_connection_map.get(
|
|
888
922
|
view_urn
|
|
889
923
|
)
|
|
@@ -939,6 +973,9 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
939
973
|
str(maybe_looker_view.id)
|
|
940
974
|
)
|
|
941
975
|
|
|
976
|
+
if not self.source_config.emit_reachable_views_only:
|
|
977
|
+
self.report_skipped_unreachable_views(viewfile_loader, processed_view_map)
|
|
978
|
+
|
|
942
979
|
if (
|
|
943
980
|
self.source_config.tag_measures_and_dimensions
|
|
944
981
|
and self.reporter.events_produced != 0
|
|
@@ -966,5 +1003,56 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
966
1003
|
),
|
|
967
1004
|
).as_workunit()
|
|
968
1005
|
|
|
1006
|
+
def report_skipped_unreachable_views(
|
|
1007
|
+
self,
|
|
1008
|
+
viewfile_loader: LookerViewFileLoader,
|
|
1009
|
+
processed_view_map: Dict[str, Set[str]] = {},
|
|
1010
|
+
) -> None:
|
|
1011
|
+
view_files: Dict[str, List[pathlib.Path]] = {}
|
|
1012
|
+
for project, folder_path in self.base_projects_folder.items():
|
|
1013
|
+
folder = pathlib.Path(folder_path)
|
|
1014
|
+
view_files[project] = list(folder.glob(f"**/*{VIEW_FILE_EXTENSION}"))
|
|
1015
|
+
|
|
1016
|
+
skipped_view_paths: Dict[str, List[str]] = {}
|
|
1017
|
+
for project, views in view_files.items():
|
|
1018
|
+
skipped_paths: Set[str] = set()
|
|
1019
|
+
|
|
1020
|
+
for view_path in views:
|
|
1021
|
+
# Check if the view is already in processed_view_map
|
|
1022
|
+
if not any(
|
|
1023
|
+
str(view_path) in view_set
|
|
1024
|
+
for view_set in processed_view_map.values()
|
|
1025
|
+
):
|
|
1026
|
+
looker_viewfile = viewfile_loader.load_viewfile(
|
|
1027
|
+
path=str(view_path),
|
|
1028
|
+
project_name=project,
|
|
1029
|
+
connection=None,
|
|
1030
|
+
reporter=self.reporter,
|
|
1031
|
+
)
|
|
1032
|
+
|
|
1033
|
+
if looker_viewfile is not None:
|
|
1034
|
+
for raw_view in looker_viewfile.views:
|
|
1035
|
+
raw_view_name = raw_view.get("name", "")
|
|
1036
|
+
|
|
1037
|
+
if (
|
|
1038
|
+
raw_view_name
|
|
1039
|
+
and self.source_config.view_pattern.allowed(
|
|
1040
|
+
raw_view_name
|
|
1041
|
+
)
|
|
1042
|
+
):
|
|
1043
|
+
skipped_paths.add(str(view_path))
|
|
1044
|
+
|
|
1045
|
+
skipped_view_paths[project] = list(skipped_paths)
|
|
1046
|
+
|
|
1047
|
+
for project, view_paths in skipped_view_paths.items():
|
|
1048
|
+
for path in view_paths:
|
|
1049
|
+
self.reporter.report_warning(
|
|
1050
|
+
title="Skipped View File",
|
|
1051
|
+
message=(
|
|
1052
|
+
"The Looker view file was skipped because it may not be referenced by any models."
|
|
1053
|
+
),
|
|
1054
|
+
context=(f"Project: {project}, View File Path: {path}"),
|
|
1055
|
+
)
|
|
1056
|
+
|
|
969
1057
|
def get_report(self):
|
|
970
1058
|
return self.reporter
|
|
@@ -5,6 +5,7 @@ import logging
|
|
|
5
5
|
import os
|
|
6
6
|
import os.path
|
|
7
7
|
import platform
|
|
8
|
+
import re
|
|
8
9
|
from dataclasses import dataclass
|
|
9
10
|
from typing import Dict, Iterable, List, Optional, Union
|
|
10
11
|
|
|
@@ -33,6 +34,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
|
33
34
|
from datahub.ingestion.source.snowflake.constants import (
|
|
34
35
|
GENERIC_PERMISSION_ERROR_KEY,
|
|
35
36
|
SnowflakeEdition,
|
|
37
|
+
SnowflakeObjectDomain,
|
|
36
38
|
)
|
|
37
39
|
from datahub.ingestion.source.snowflake.snowflake_assertion import (
|
|
38
40
|
SnowflakeAssertionsHandler,
|
|
@@ -162,6 +164,8 @@ class SnowflakeV2Source(
|
|
|
162
164
|
self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
|
|
163
165
|
self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
|
|
164
166
|
|
|
167
|
+
self.discovered_datasets: Optional[List[str]] = None
|
|
168
|
+
|
|
165
169
|
self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
|
|
166
170
|
SqlParsingAggregator(
|
|
167
171
|
platform=self.identifiers.platform,
|
|
@@ -182,6 +186,8 @@ class SnowflakeV2Source(
|
|
|
182
186
|
generate_usage_statistics=False,
|
|
183
187
|
generate_operations=False,
|
|
184
188
|
format_queries=self.config.format_sql_queries,
|
|
189
|
+
is_temp_table=self._is_temp_table,
|
|
190
|
+
is_allowed_table=self._is_allowed_table,
|
|
185
191
|
)
|
|
186
192
|
)
|
|
187
193
|
self.report.sql_aggregator = self.aggregator.report
|
|
@@ -444,6 +450,34 @@ class SnowflakeV2Source(
|
|
|
444
450
|
|
|
445
451
|
return _report
|
|
446
452
|
|
|
453
|
+
def _is_temp_table(self, name: str) -> bool:
|
|
454
|
+
if any(
|
|
455
|
+
re.match(pattern, name, flags=re.IGNORECASE)
|
|
456
|
+
for pattern in self.config.temporary_tables_pattern
|
|
457
|
+
):
|
|
458
|
+
return True
|
|
459
|
+
|
|
460
|
+
# This is also a temp table if
|
|
461
|
+
# 1. this name would be allowed by the dataset patterns, and
|
|
462
|
+
# 2. we have a list of discovered tables, and
|
|
463
|
+
# 3. it's not in the discovered tables list
|
|
464
|
+
if (
|
|
465
|
+
self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE)
|
|
466
|
+
and self.discovered_datasets
|
|
467
|
+
and name not in self.discovered_datasets
|
|
468
|
+
):
|
|
469
|
+
return True
|
|
470
|
+
|
|
471
|
+
return False
|
|
472
|
+
|
|
473
|
+
def _is_allowed_table(self, name: str) -> bool:
|
|
474
|
+
if self.discovered_datasets and name not in self.discovered_datasets:
|
|
475
|
+
return False
|
|
476
|
+
|
|
477
|
+
return self.filters.is_dataset_pattern_allowed(
|
|
478
|
+
name, SnowflakeObjectDomain.TABLE
|
|
479
|
+
)
|
|
480
|
+
|
|
447
481
|
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
|
|
448
482
|
return [
|
|
449
483
|
*super().get_workunit_processors(),
|
|
@@ -513,7 +547,7 @@ class SnowflakeV2Source(
|
|
|
513
547
|
)
|
|
514
548
|
return
|
|
515
549
|
|
|
516
|
-
discovered_datasets = discovered_tables + discovered_views
|
|
550
|
+
self.discovered_datasets = discovered_tables + discovered_views
|
|
517
551
|
|
|
518
552
|
if self.config.use_queries_v2:
|
|
519
553
|
with self.report.new_stage(f"*: {VIEW_PARSING}"):
|
|
@@ -538,13 +572,14 @@ class SnowflakeV2Source(
|
|
|
538
572
|
filters=self.filters,
|
|
539
573
|
identifiers=self.identifiers,
|
|
540
574
|
schema_resolver=schema_resolver,
|
|
541
|
-
discovered_tables=discovered_datasets,
|
|
575
|
+
discovered_tables=self.discovered_datasets,
|
|
542
576
|
graph=self.ctx.graph,
|
|
543
577
|
)
|
|
544
578
|
|
|
545
579
|
# TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
|
|
546
580
|
# but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
|
|
547
581
|
# it should be pretty straightforward to refactor this and only initialize the aggregator once.
|
|
582
|
+
# This also applies for the _is_temp_table and _is_allowed_table methods above, duplicated from SnowflakeQueriesExtractor.
|
|
548
583
|
self.report.queries_extractor = queries_extractor.report
|
|
549
584
|
yield from queries_extractor.get_workunits_internal()
|
|
550
585
|
queries_extractor.close()
|
|
@@ -568,12 +603,14 @@ class SnowflakeV2Source(
|
|
|
568
603
|
if (
|
|
569
604
|
self.config.include_usage_stats or self.config.include_operational_stats
|
|
570
605
|
) and self.usage_extractor:
|
|
571
|
-
yield from self.usage_extractor.get_usage_workunits(
|
|
606
|
+
yield from self.usage_extractor.get_usage_workunits(
|
|
607
|
+
self.discovered_datasets
|
|
608
|
+
)
|
|
572
609
|
|
|
573
610
|
if self.config.include_assertion_results:
|
|
574
611
|
yield from SnowflakeAssertionsHandler(
|
|
575
612
|
self.config, self.report, self.connection, self.identifiers
|
|
576
|
-
).get_assertion_workunits(discovered_datasets)
|
|
613
|
+
).get_assertion_workunits(self.discovered_datasets)
|
|
577
614
|
|
|
578
615
|
self.connection.close()
|
|
579
616
|
|
|
File without changes
|
|
File without changes
|
{acryl_datahub-0.15.0.5rc3.dist-info → acryl_datahub-0.15.0.5rc5.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|