acryl-datahub 1.2.0.11rc1__py3-none-any.whl → 1.2.0.11rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (39)
  1. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/RECORD +39 -37
  3. datahub/_version.py +1 -1
  4. datahub/cli/docker_cli.py +1 -1
  5. datahub/configuration/common.py +11 -0
  6. datahub/configuration/kafka.py +19 -1
  7. datahub/configuration/validate_field_removal.py +3 -0
  8. datahub/ingestion/autogenerated/capability_summary.json +2 -2
  9. datahub/ingestion/graph/client.py +7 -7
  10. datahub/ingestion/graph/filters.py +30 -11
  11. datahub/ingestion/source/aws/s3_boto_utils.py +4 -1
  12. datahub/ingestion/source/data_lake_common/path_spec.py +39 -2
  13. datahub/ingestion/source/looker/looker_common.py +6 -0
  14. datahub/ingestion/source/looker/looker_constant.py +4 -0
  15. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  16. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  17. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  18. datahub/ingestion/source/looker/lookml_config.py +30 -2
  19. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  20. datahub/ingestion/source/looker/lookml_source.py +42 -29
  21. datahub/ingestion/source/looker/view_upstream.py +494 -1
  22. datahub/ingestion/source/s3/source.py +125 -164
  23. datahub/ingestion/source/snaplogic/snaplogic.py +4 -4
  24. datahub/ingestion/source/snaplogic/snaplogic_config.py +4 -4
  25. datahub/ingestion/source/snowflake/snowflake_utils.py +9 -9
  26. datahub/metadata/_internal_schema_classes.py +1 -1
  27. datahub/metadata/schema.avsc +1 -1
  28. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  29. datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
  30. datahub/sdk/search_filters.py +122 -1
  31. datahub/secret/datahub_secret_store.py +3 -0
  32. datahub/secret/environment_secret_store.py +29 -0
  33. datahub/secret/file_secret_store.py +49 -0
  34. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  35. datahub/sql_parsing/sqlglot_lineage.py +6 -1
  36. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/WHEEL +0 -0
  37. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/entry_points.txt +0 -0
  38. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/licenses/LICENSE +0 -0
  39. {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
@@ -2,6 +2,7 @@
2
2
  import json
3
3
  import logging
4
4
  import os
5
+ from enum import Enum
5
6
  from functools import lru_cache
6
7
  from typing import Dict, List, MutableMapping, Optional, Sequence, Set, Union, cast
7
8
 
@@ -31,6 +32,14 @@ from datahub.configuration.common import ConfigurationError
31
32
  logger = logging.getLogger(__name__)
32
33
 
33
34
 
35
+ class LookerQueryResponseFormat(Enum):
36
+ # result_format - Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query
37
+ JSON = "json"
38
+ SQL = (
39
+ "sql" # Note: This does not execute the query, it only generates the SQL query.
40
+ )
41
+
42
+
34
43
  class TransportOptionsConfig(ConfigModel):
35
44
  timeout: int
36
45
  headers: MutableMapping[str, str]
@@ -69,6 +78,7 @@ class LookerAPIStats(BaseModel):
69
78
  search_looks_calls: int = 0
70
79
  search_dashboards_calls: int = 0
71
80
  all_user_calls: int = 0
81
+ generate_sql_query_calls: int = 0
72
82
 
73
83
 
74
84
  class LookerAPI:
@@ -170,17 +180,40 @@ class LookerAPI:
170
180
  logger.debug(f"Executing query {write_query}")
171
181
  self.client_stats.query_calls += 1
172
182
 
173
- response_json = self.client.run_inline_query(
174
- result_format="json",
183
+ response = self.client.run_inline_query(
184
+ result_format=LookerQueryResponseFormat.JSON.value,
175
185
  body=write_query,
176
186
  transport_options=self.transport_options,
177
187
  )
178
188
 
189
+ data = json.loads(response)
190
+
179
191
  logger.debug("=================Response=================")
180
- data = json.loads(response_json)
181
192
  logger.debug("Length of response: %d", len(data))
182
193
  return data
183
194
 
195
+ def generate_sql_query(
196
+ self, write_query: WriteQuery, use_cache: bool = False
197
+ ) -> str:
198
+ """
199
+ Generates a SQL query string for a given WriteQuery.
200
+
201
+ Note: This does not execute the query, it only generates the SQL query.
202
+ """
203
+ logger.debug(f"Generating SQL query for {write_query}")
204
+ self.client_stats.generate_sql_query_calls += 1
205
+
206
+ response = self.client.run_inline_query(
207
+ result_format=LookerQueryResponseFormat.SQL.value,
208
+ body=write_query,
209
+ transport_options=self.transport_options,
210
+ cache=use_cache,
211
+ )
212
+
213
+ logger.debug("=================Response=================")
214
+ logger.debug("Length of SQL response: %d", len(response))
215
+ return str(response)
216
+
184
217
  def dashboard(self, dashboard_id: str, fields: Union[str, List[str]]) -> Dashboard:
185
218
  self.client_stats.dashboard_calls += 1
186
219
  return self.client.dashboard(
datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
@@ -3,11 +3,11 @@ from typing import Dict, List, Optional
3
3
 
4
4
  from datahub.ingestion.source.looker.looker_common import LookerViewId, ViewFieldValue
5
5
  from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
6
+ from datahub.ingestion.source.looker.looker_constant import NAME
6
7
  from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
7
8
  from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
8
9
  from datahub.ingestion.source.looker.lookml_config import (
9
10
  BASE_PROJECT_NAME,
10
- NAME,
11
11
  LookMLSourceReport,
12
12
  )
13
13
 
datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
@@ -12,12 +12,12 @@ from datahub.ingestion.source.looker.looker_constant import (
12
12
  DIMENSION_GROUPS,
13
13
  DIMENSIONS,
14
14
  MEASURES,
15
+ NAME,
15
16
  )
16
17
  from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
17
18
  from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
18
19
  from datahub.ingestion.source.looker.lookml_config import (
19
20
  DERIVED_VIEW_SUFFIX,
20
- NAME,
21
21
  LookMLSourceReport,
22
22
  )
23
23
  from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver
datahub/ingestion/source/looker/lookml_config.py +30 -2
@@ -28,11 +28,10 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
28
28
  StatefulIngestionConfigBase,
29
29
  )
30
30
  from datahub.utilities.lossy_collections import LossyList
31
+ from datahub.utilities.stats_collections import TopKDict, float_top_k_dict
31
32
 
32
33
  logger = logging.getLogger(__name__)
33
34
 
34
- NAME: str = "name"
35
-
36
35
  BASE_PROJECT_NAME = "__BASE"
37
36
 
38
37
  EXPLORE_FILE_EXTENSION = ".explore.lkml"
@@ -47,6 +46,9 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
47
46
  @dataclass
48
47
  class LookMLSourceReport(StaleEntityRemovalSourceReport):
49
48
  git_clone_latency: Optional[timedelta] = None
49
+ looker_query_api_latency_seconds: TopKDict[str, float] = dataclass_field(
50
+ default_factory=float_top_k_dict
51
+ )
50
52
  models_discovered: int = 0
51
53
  models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
52
54
  views_discovered: int = 0
@@ -81,6 +83,11 @@ class LookMLSourceReport(StaleEntityRemovalSourceReport):
81
83
  self.api_stats = self._looker_api.compute_stats()
82
84
  return super().compute_stats()
83
85
 
86
+ def report_looker_query_api_latency(
87
+ self, view_urn: str, latency: timedelta
88
+ ) -> None:
89
+ self.looker_query_api_latency_seconds[view_urn] = latency.total_seconds()
90
+
84
91
 
85
92
  class LookMLSourceConfig(
86
93
  LookerCommonConfig, StatefulIngestionConfigBase, EnvConfigMixin
@@ -122,6 +129,16 @@ class LookMLSourceConfig(
122
129
  description="List of regex patterns for LookML views to include in the extraction.",
123
130
  )
124
131
  parse_table_names_from_sql: bool = Field(True, description="See note below.")
132
+ use_api_for_view_lineage: bool = Field(
133
+ False,
134
+ description="When enabled, uses Looker API to get SQL representation of views for lineage parsing instead of parsing LookML files directly. Requires 'api' configuration to be provided."
135
+ "Coverage of regex based lineage extraction has limitations, it only supportes ${TABLE}.column_name syntax, See (https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions) to"
136
+ "understand the other substitutions and cross-references allowed in LookML.",
137
+ )
138
+ use_api_cache_for_view_lineage: bool = Field(
139
+ False,
140
+ description="When enabled, uses Looker API server-side caching for query execution. Requires 'api' configuration to be provided.",
141
+ )
125
142
  api: Optional[LookerAPIConfig] = None
126
143
  project_name: Optional[str] = Field(
127
144
  None,
@@ -239,6 +256,17 @@ class LookMLSourceConfig(
239
256
  )
240
257
  return values
241
258
 
259
+ @root_validator(skip_on_failure=True)
260
+ def check_api_provided_for_view_lineage(cls, values):
261
+ """Validate that we must have an api credential to use Looker API for view's column lineage"""
262
+ if not values.get("api") and values.get("use_api_for_view_lineage"):
263
+ raise ValueError(
264
+ "API credential was not found. LookML source requires api credentials "
265
+ "for Looker to use Looker APIs for view's column lineage extraction."
266
+ "Set `use_api_for_view_lineage` to False to skip using Looker APIs."
267
+ )
268
+ return values
269
+
242
270
  @validator("base_folder", always=True)
243
271
  def check_base_folder_if_not_provided(
244
272
  cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
datahub/ingestion/source/looker/lookml_refinement.py +1 -1
@@ -4,10 +4,10 @@ import logging
4
4
  from typing import ClassVar, Dict, List, Set
5
5
 
6
6
  from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
7
+ from datahub.ingestion.source.looker.looker_constant import NAME
7
8
  from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
8
9
  from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
9
10
  from datahub.ingestion.source.looker.lookml_config import (
10
- NAME,
11
11
  LookMLSourceConfig,
12
12
  LookMLSourceReport,
13
13
  )
datahub/ingestion/source/looker/lookml_source.py +42 -29
@@ -142,6 +142,8 @@ class LookerView:
142
142
  ctx: PipelineContext,
143
143
  extract_col_level_lineage: bool = False,
144
144
  populate_sql_logic_in_descriptions: bool = False,
145
+ looker_client: Optional[LookerAPI] = None,
146
+ view_to_explore_map: Optional[Dict[str, str]] = None,
145
147
  ) -> Optional["LookerView"]:
146
148
  view_name = view_context.name()
147
149
 
@@ -160,6 +162,8 @@ class LookerView:
160
162
  config=config,
161
163
  ctx=ctx,
162
164
  reporter=reporter,
165
+ looker_client=looker_client,
166
+ view_to_explore_map=view_to_explore_map,
163
167
  )
164
168
 
165
169
  field_type_vs_raw_fields = OrderedDict(
@@ -705,6 +709,11 @@ class LookMLSource(StatefulIngestionSourceBase):
705
709
  # Value: Tuple(model file name, connection name)
706
710
  view_connection_map: Dict[str, Tuple[str, str]] = {}
707
711
 
712
+ # Map of view name to explore name for API-based view lineage
713
+ # A view can be referenced by multiple explores, we only need one of the explores to use Looker Query API
714
+ # Key: view_name, Value: explore_name
715
+ view_to_explore_map: Dict[str, str] = {}
716
+
708
717
  # The ** means "this directory and all subdirectories", and hence should
709
718
  # include all the files we want.
710
719
  model_files = sorted(
@@ -759,37 +768,37 @@ class LookMLSource(StatefulIngestionSourceBase):
759
768
  )
760
769
  )
761
770
 
762
- if self.source_config.emit_reachable_views_only:
763
- model_explores_map = {d["name"]: d for d in model.explores}
764
- for explore_dict in model.explores:
765
- try:
766
- if LookerRefinementResolver.is_refinement(explore_dict["name"]):
767
- continue
771
+ model_explores_map = {d["name"]: d for d in model.explores}
772
+ for explore_dict in model.explores:
773
+ try:
774
+ if LookerRefinementResolver.is_refinement(explore_dict["name"]):
775
+ continue
768
776
 
769
- explore_dict = (
770
- looker_refinement_resolver.apply_explore_refinement(
771
- explore_dict
772
- )
773
- )
774
- explore: LookerExplore = LookerExplore.from_dict(
775
- model_name,
776
- explore_dict,
777
- model.resolved_includes,
778
- viewfile_loader,
779
- self.reporter,
780
- model_explores_map,
781
- )
782
- if explore.upstream_views:
783
- for view_name in explore.upstream_views:
777
+ explore_dict = looker_refinement_resolver.apply_explore_refinement(
778
+ explore_dict
779
+ )
780
+ explore: LookerExplore = LookerExplore.from_dict(
781
+ model_name,
782
+ explore_dict,
783
+ model.resolved_includes,
784
+ viewfile_loader,
785
+ self.reporter,
786
+ model_explores_map,
787
+ )
788
+ if explore.upstream_views:
789
+ for view_name in explore.upstream_views:
790
+ if self.source_config.emit_reachable_views_only:
784
791
  explore_reachable_views.add(view_name.include)
785
- except Exception as e:
786
- self.reporter.report_warning(
787
- title="Failed to process explores",
788
- message="Failed to process explore dictionary.",
789
- context=f"Explore Details: {explore_dict}",
790
- exc=e,
791
- )
792
- logger.debug("Failed to process explore", exc_info=e)
792
+ # Build view to explore mapping for API-based view lineage
793
+ view_to_explore_map[view_name.include] = explore.name
794
+ except Exception as e:
795
+ self.reporter.report_warning(
796
+ title="Failed to process explores",
797
+ message="Failed to process explore dictionary.",
798
+ context=f"Explore Details: {explore_dict}",
799
+ exc=e,
800
+ )
801
+ logger.debug("Failed to process explore", exc_info=e)
793
802
 
794
803
  processed_view_files = processed_view_map.setdefault(
795
804
  model.connection, set()
@@ -878,6 +887,10 @@ class LookMLSource(StatefulIngestionSourceBase):
878
887
  populate_sql_logic_in_descriptions=self.source_config.populate_sql_logic_for_missing_descriptions,
879
888
  config=self.source_config,
880
889
  ctx=self.ctx,
890
+ looker_client=self.looker_client,
891
+ view_to_explore_map=view_to_explore_map
892
+ if view_to_explore_map
893
+ else None,
881
894
  )
882
895
  except Exception as e:
883
896
  self.reporter.report_warning(