acryl-datahub 0.15.0.5rc3__py3-none-any.whl → 0.15.0.5rc5__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; consult the release advisory for more details.

@@ -43,6 +43,7 @@ from datahub.ingestion.source.looker.looker_common import (
43
43
  from datahub.ingestion.source.looker.looker_connection import (
44
44
  get_connection_def_based_on_connection_string,
45
45
  )
46
+ from datahub.ingestion.source.looker.looker_dataclasses import LookerConstant
46
47
  from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
47
48
  from datahub.ingestion.source.looker.looker_template_language import (
48
49
  load_and_preprocess_file,
@@ -59,6 +60,7 @@ from datahub.ingestion.source.looker.lookml_concept_context import (
59
60
  from datahub.ingestion.source.looker.lookml_config import (
60
61
  BASE_PROJECT_NAME,
61
62
  MODEL_FILE_EXTENSION,
63
+ VIEW_FILE_EXTENSION,
62
64
  LookerConnectionDefinition,
63
65
  LookMLSourceConfig,
64
66
  LookMLSourceReport,
@@ -253,6 +255,7 @@ class LookerManifest:
253
255
  # This must be set if the manifest has local_dependency entries.
254
256
  # See https://cloud.google.com/looker/docs/reference/param-manifest-project-name
255
257
  project_name: Optional[str]
258
+ constants: Optional[List[Dict[str, str]]]
256
259
 
257
260
  local_dependencies: List[str]
258
261
  remote_dependencies: List[LookerRemoteDependency]
@@ -309,11 +312,14 @@ class LookMLSource(StatefulIngestionSourceBase):
309
312
  "manage_models permission enabled on this API key."
310
313
  ) from err
311
314
 
315
+ self.manifest_constants: Dict[str, "LookerConstant"] = {}
316
+
312
317
  def _load_model(self, path: str) -> LookerModel:
313
318
  logger.debug(f"Loading model from file {path}")
314
319
 
315
320
  parsed = load_and_preprocess_file(
316
321
  path=path,
322
+ reporter=self.reporter,
317
323
  source_config=self.source_config,
318
324
  )
319
325
 
@@ -499,27 +505,33 @@ class LookMLSource(StatefulIngestionSourceBase):
499
505
 
500
506
  def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]:
501
507
  manifest_file = folder / "manifest.lkml"
502
- if manifest_file.exists():
503
- manifest_dict = load_and_preprocess_file(
504
- path=manifest_file, source_config=self.source_config
505
- )
506
508
 
507
- manifest = LookerManifest(
508
- project_name=manifest_dict.get("project_name"),
509
- local_dependencies=[
510
- x["project"] for x in manifest_dict.get("local_dependencys", [])
511
- ],
512
- remote_dependencies=[
513
- LookerRemoteDependency(
514
- name=x["name"], url=x["url"], ref=x.get("ref")
515
- )
516
- for x in manifest_dict.get("remote_dependencys", [])
517
- ],
509
+ if not manifest_file.exists():
510
+ self.reporter.info(
511
+ message="manifest.lkml file missing from project",
512
+ context=str(manifest_file),
518
513
  )
519
- return manifest
520
- else:
521
514
  return None
522
515
 
516
+ manifest_dict = load_and_preprocess_file(
517
+ path=manifest_file,
518
+ source_config=self.source_config,
519
+ reporter=self.reporter,
520
+ )
521
+
522
+ manifest = LookerManifest(
523
+ project_name=manifest_dict.get("project_name"),
524
+ constants=manifest_dict.get("constants", []),
525
+ local_dependencies=[
526
+ x["project"] for x in manifest_dict.get("local_dependencys", [])
527
+ ],
528
+ remote_dependencies=[
529
+ LookerRemoteDependency(name=x["name"], url=x["url"], ref=x.get("ref"))
530
+ for x in manifest_dict.get("remote_dependencys", [])
531
+ ],
532
+ )
533
+ return manifest
534
+
523
535
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
524
536
  return [
525
537
  *super().get_workunit_processors(),
@@ -574,7 +586,10 @@ class LookMLSource(StatefulIngestionSourceBase):
574
586
  self.base_projects_folder[project] = p_ref
575
587
 
576
588
  self._recursively_check_manifests(
577
- tmp_dir, BASE_PROJECT_NAME, visited_projects
589
+ tmp_dir,
590
+ BASE_PROJECT_NAME,
591
+ visited_projects,
592
+ self.manifest_constants,
578
593
  )
579
594
 
580
595
  yield from self.get_internal_workunits()
@@ -587,7 +602,11 @@ class LookMLSource(StatefulIngestionSourceBase):
587
602
  )
588
603
 
589
604
  def _recursively_check_manifests(
590
- self, tmp_dir: str, project_name: str, project_visited: Set[str]
605
+ self,
606
+ tmp_dir: str,
607
+ project_name: str,
608
+ project_visited: Set[str],
609
+ manifest_constants: Dict[str, "LookerConstant"],
591
610
  ) -> None:
592
611
  if project_name in project_visited:
593
612
  return
@@ -604,6 +623,14 @@ class LookMLSource(StatefulIngestionSourceBase):
604
623
  if not manifest:
605
624
  return
606
625
 
626
+ if manifest.constants:
627
+ for constant in manifest.constants:
628
+ if constant.get("name") and constant.get("value"):
629
+ manifest_constants[constant["name"]] = LookerConstant(
630
+ name=constant["name"],
631
+ value=constant["value"],
632
+ )
633
+
607
634
  # Special case handling if the root project has a name in the manifest file.
608
635
  if project_name == BASE_PROJECT_NAME and manifest.project_name:
609
636
  if (
@@ -663,21 +690,27 @@ class LookMLSource(StatefulIngestionSourceBase):
663
690
  project_visited.add(project_name)
664
691
  else:
665
692
  self._recursively_check_manifests(
666
- tmp_dir, remote_project.name, project_visited
693
+ tmp_dir,
694
+ remote_project.name,
695
+ project_visited,
696
+ manifest_constants,
667
697
  )
668
698
 
669
699
  for project in manifest.local_dependencies:
670
- self._recursively_check_manifests(tmp_dir, project, project_visited)
700
+ self._recursively_check_manifests(
701
+ tmp_dir, project, project_visited, manifest_constants
702
+ )
671
703
 
672
704
  def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
673
705
  assert self.source_config.base_folder
674
-
675
706
  viewfile_loader = LookerViewFileLoader(
676
707
  self.source_config.project_name,
677
708
  self.base_projects_folder,
678
709
  self.reporter,
679
710
  self.source_config,
711
+ self.manifest_constants,
680
712
  )
713
+ logger.debug(f"LookML Constants : {', '.join(self.manifest_constants.keys())}")
681
714
 
682
715
  # Some views can be mentioned by multiple 'include' statements and can be included via different connections.
683
716
 
@@ -884,6 +917,7 @@ class LookMLSource(StatefulIngestionSourceBase):
884
917
  view_urn = maybe_looker_view.id.get_urn(
885
918
  self.source_config
886
919
  )
920
+
887
921
  view_connection_mapping = view_connection_map.get(
888
922
  view_urn
889
923
  )
@@ -939,6 +973,9 @@ class LookMLSource(StatefulIngestionSourceBase):
939
973
  str(maybe_looker_view.id)
940
974
  )
941
975
 
976
+ if not self.source_config.emit_reachable_views_only:
977
+ self.report_skipped_unreachable_views(viewfile_loader, processed_view_map)
978
+
942
979
  if (
943
980
  self.source_config.tag_measures_and_dimensions
944
981
  and self.reporter.events_produced != 0
@@ -966,5 +1003,56 @@ class LookMLSource(StatefulIngestionSourceBase):
966
1003
  ),
967
1004
  ).as_workunit()
968
1005
 
1006
+ def report_skipped_unreachable_views(
1007
+ self,
1008
+ viewfile_loader: LookerViewFileLoader,
1009
+ processed_view_map: Dict[str, Set[str]] = {},
1010
+ ) -> None:
1011
+ view_files: Dict[str, List[pathlib.Path]] = {}
1012
+ for project, folder_path in self.base_projects_folder.items():
1013
+ folder = pathlib.Path(folder_path)
1014
+ view_files[project] = list(folder.glob(f"**/*{VIEW_FILE_EXTENSION}"))
1015
+
1016
+ skipped_view_paths: Dict[str, List[str]] = {}
1017
+ for project, views in view_files.items():
1018
+ skipped_paths: Set[str] = set()
1019
+
1020
+ for view_path in views:
1021
+ # Check if the view is already in processed_view_map
1022
+ if not any(
1023
+ str(view_path) in view_set
1024
+ for view_set in processed_view_map.values()
1025
+ ):
1026
+ looker_viewfile = viewfile_loader.load_viewfile(
1027
+ path=str(view_path),
1028
+ project_name=project,
1029
+ connection=None,
1030
+ reporter=self.reporter,
1031
+ )
1032
+
1033
+ if looker_viewfile is not None:
1034
+ for raw_view in looker_viewfile.views:
1035
+ raw_view_name = raw_view.get("name", "")
1036
+
1037
+ if (
1038
+ raw_view_name
1039
+ and self.source_config.view_pattern.allowed(
1040
+ raw_view_name
1041
+ )
1042
+ ):
1043
+ skipped_paths.add(str(view_path))
1044
+
1045
+ skipped_view_paths[project] = list(skipped_paths)
1046
+
1047
+ for project, view_paths in skipped_view_paths.items():
1048
+ for path in view_paths:
1049
+ self.reporter.report_warning(
1050
+ title="Skipped View File",
1051
+ message=(
1052
+ "The Looker view file was skipped because it may not be referenced by any models."
1053
+ ),
1054
+ context=(f"Project: {project}, View File Path: {path}"),
1055
+ )
1056
+
969
1057
  def get_report(self):
970
1058
  return self.reporter
@@ -5,6 +5,7 @@ import logging
5
5
  import os
6
6
  import os.path
7
7
  import platform
8
+ import re
8
9
  from dataclasses import dataclass
9
10
  from typing import Dict, Iterable, List, Optional, Union
10
11
 
@@ -33,6 +34,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
33
34
  from datahub.ingestion.source.snowflake.constants import (
34
35
  GENERIC_PERMISSION_ERROR_KEY,
35
36
  SnowflakeEdition,
37
+ SnowflakeObjectDomain,
36
38
  )
37
39
  from datahub.ingestion.source.snowflake.snowflake_assertion import (
38
40
  SnowflakeAssertionsHandler,
@@ -162,6 +164,8 @@ class SnowflakeV2Source(
162
164
  self.data_dictionary = SnowflakeDataDictionary(connection=self.connection)
163
165
  self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None
164
166
 
167
+ self.discovered_datasets: Optional[List[str]] = None
168
+
165
169
  self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context(
166
170
  SqlParsingAggregator(
167
171
  platform=self.identifiers.platform,
@@ -182,6 +186,8 @@ class SnowflakeV2Source(
182
186
  generate_usage_statistics=False,
183
187
  generate_operations=False,
184
188
  format_queries=self.config.format_sql_queries,
189
+ is_temp_table=self._is_temp_table,
190
+ is_allowed_table=self._is_allowed_table,
185
191
  )
186
192
  )
187
193
  self.report.sql_aggregator = self.aggregator.report
@@ -444,6 +450,34 @@ class SnowflakeV2Source(
444
450
 
445
451
  return _report
446
452
 
453
+ def _is_temp_table(self, name: str) -> bool:
454
+ if any(
455
+ re.match(pattern, name, flags=re.IGNORECASE)
456
+ for pattern in self.config.temporary_tables_pattern
457
+ ):
458
+ return True
459
+
460
+ # This is also a temp table if
461
+ # 1. this name would be allowed by the dataset patterns, and
462
+ # 2. we have a list of discovered tables, and
463
+ # 3. it's not in the discovered tables list
464
+ if (
465
+ self.filters.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE)
466
+ and self.discovered_datasets
467
+ and name not in self.discovered_datasets
468
+ ):
469
+ return True
470
+
471
+ return False
472
+
473
+ def _is_allowed_table(self, name: str) -> bool:
474
+ if self.discovered_datasets and name not in self.discovered_datasets:
475
+ return False
476
+
477
+ return self.filters.is_dataset_pattern_allowed(
478
+ name, SnowflakeObjectDomain.TABLE
479
+ )
480
+
447
481
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
448
482
  return [
449
483
  *super().get_workunit_processors(),
@@ -513,7 +547,7 @@ class SnowflakeV2Source(
513
547
  )
514
548
  return
515
549
 
516
- discovered_datasets = discovered_tables + discovered_views
550
+ self.discovered_datasets = discovered_tables + discovered_views
517
551
 
518
552
  if self.config.use_queries_v2:
519
553
  with self.report.new_stage(f"*: {VIEW_PARSING}"):
@@ -538,13 +572,14 @@ class SnowflakeV2Source(
538
572
  filters=self.filters,
539
573
  identifiers=self.identifiers,
540
574
  schema_resolver=schema_resolver,
541
- discovered_tables=discovered_datasets,
575
+ discovered_tables=self.discovered_datasets,
542
576
  graph=self.ctx.graph,
543
577
  )
544
578
 
545
579
  # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
546
580
  # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
547
581
  # it should be pretty straightforward to refactor this and only initialize the aggregator once.
582
+ # This also applies for the _is_temp_table and _is_allowed_table methods above, duplicated from SnowflakeQueriesExtractor.
548
583
  self.report.queries_extractor = queries_extractor.report
549
584
  yield from queries_extractor.get_workunits_internal()
550
585
  queries_extractor.close()
@@ -568,12 +603,14 @@ class SnowflakeV2Source(
568
603
  if (
569
604
  self.config.include_usage_stats or self.config.include_operational_stats
570
605
  ) and self.usage_extractor:
571
- yield from self.usage_extractor.get_usage_workunits(discovered_datasets)
606
+ yield from self.usage_extractor.get_usage_workunits(
607
+ self.discovered_datasets
608
+ )
572
609
 
573
610
  if self.config.include_assertion_results:
574
611
  yield from SnowflakeAssertionsHandler(
575
612
  self.config, self.report, self.connection, self.identifiers
576
- ).get_assertion_workunits(discovered_datasets)
613
+ ).get_assertion_workunits(self.discovered_datasets)
577
614
 
578
615
  self.connection.close()
579
616