acryl-datahub 1.2.0.11rc1__py3-none-any.whl → 1.2.0.11rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/METADATA +2557 -2557
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/RECORD +39 -37
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/configuration/common.py +11 -0
- datahub/configuration/kafka.py +19 -1
- datahub/configuration/validate_field_removal.py +3 -0
- datahub/ingestion/autogenerated/capability_summary.json +2 -2
- datahub/ingestion/graph/client.py +7 -7
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +4 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +39 -2
- datahub/ingestion/source/looker/looker_common.py +6 -0
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +30 -2
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +42 -29
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/s3/source.py +125 -164
- datahub/ingestion/source/snaplogic/snaplogic.py +4 -4
- datahub/ingestion/source/snaplogic/snaplogic_config.py +4 -4
- datahub/ingestion/source/snowflake/snowflake_utils.py +9 -9
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/sdk/search_filters.py +122 -1
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/sql_parsing/sqlglot_lineage.py +6 -1
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/top_level.txt +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
|
+
from enum import Enum
|
|
5
6
|
from functools import lru_cache
|
|
6
7
|
from typing import Dict, List, MutableMapping, Optional, Sequence, Set, Union, cast
|
|
7
8
|
|
|
@@ -31,6 +32,14 @@ from datahub.configuration.common import ConfigurationError
|
|
|
31
32
|
logger = logging.getLogger(__name__)
|
|
32
33
|
|
|
33
34
|
|
|
35
|
+
class LookerQueryResponseFormat(Enum):
    """Values passed as ``result_format`` to the Looker ``run_inline_query`` API.

    Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query
    """

    # JSON-encoded response.
    JSON = "json"
    # Note: This does not execute the query, it only generates the SQL query.
    SQL = "sql"
|
|
41
|
+
|
|
42
|
+
|
|
34
43
|
class TransportOptionsConfig(ConfigModel):
|
|
35
44
|
timeout: int
|
|
36
45
|
headers: MutableMapping[str, str]
|
|
@@ -69,6 +78,7 @@ class LookerAPIStats(BaseModel):
|
|
|
69
78
|
search_looks_calls: int = 0
|
|
70
79
|
search_dashboards_calls: int = 0
|
|
71
80
|
all_user_calls: int = 0
|
|
81
|
+
generate_sql_query_calls: int = 0
|
|
72
82
|
|
|
73
83
|
|
|
74
84
|
class LookerAPI:
|
|
@@ -170,17 +180,40 @@ class LookerAPI:
|
|
|
170
180
|
logger.debug(f"Executing query {write_query}")
|
|
171
181
|
self.client_stats.query_calls += 1
|
|
172
182
|
|
|
173
|
-
|
|
174
|
-
result_format=
|
|
183
|
+
response = self.client.run_inline_query(
|
|
184
|
+
result_format=LookerQueryResponseFormat.JSON.value,
|
|
175
185
|
body=write_query,
|
|
176
186
|
transport_options=self.transport_options,
|
|
177
187
|
)
|
|
178
188
|
|
|
189
|
+
data = json.loads(response)
|
|
190
|
+
|
|
179
191
|
logger.debug("=================Response=================")
|
|
180
|
-
data = json.loads(response_json)
|
|
181
192
|
logger.debug("Length of response: %d", len(data))
|
|
182
193
|
return data
|
|
183
194
|
|
|
195
|
+
def generate_sql_query(
    self, write_query: WriteQuery, use_cache: bool = False
) -> str:
    """
    Generates a SQL query string for a given WriteQuery.

    Note: This does not execute the query, it only generates the SQL query.

    Args:
        write_query: Query definition, passed as ``body`` to ``run_inline_query``.
        use_cache: Forwarded as the ``cache`` argument of ``run_inline_query``.

    Returns:
        The ``run_inline_query`` response converted to ``str``.
    """
    logger.debug("Generating SQL query for %s", write_query)
    self.client_stats.generate_sql_query_calls += 1

    response = self.client.run_inline_query(
        result_format=LookerQueryResponseFormat.SQL.value,
        body=write_query,
        transport_options=self.transport_options,
        cache=use_cache,
    )

    # Convert once so the logged length describes exactly the value returned,
    # and so a response without __len__ cannot break the debug logging.
    sql = str(response)

    logger.debug("=================Response=================")
    logger.debug("Length of SQL response: %d", len(sql))
    return sql
|
|
216
|
+
|
|
184
217
|
def dashboard(self, dashboard_id: str, fields: Union[str, List[str]]) -> Dashboard:
|
|
185
218
|
self.client_stats.dashboard_calls += 1
|
|
186
219
|
return self.client.dashboard(
|
|
@@ -3,11 +3,11 @@ from typing import Dict, List, Optional
|
|
|
3
3
|
|
|
4
4
|
from datahub.ingestion.source.looker.looker_common import LookerViewId, ViewFieldValue
|
|
5
5
|
from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
|
|
6
|
+
from datahub.ingestion.source.looker.looker_constant import NAME
|
|
6
7
|
from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
|
|
7
8
|
from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
|
|
8
9
|
from datahub.ingestion.source.looker.lookml_config import (
|
|
9
10
|
BASE_PROJECT_NAME,
|
|
10
|
-
NAME,
|
|
11
11
|
LookMLSourceReport,
|
|
12
12
|
)
|
|
13
13
|
|
|
@@ -12,12 +12,12 @@ from datahub.ingestion.source.looker.looker_constant import (
|
|
|
12
12
|
DIMENSION_GROUPS,
|
|
13
13
|
DIMENSIONS,
|
|
14
14
|
MEASURES,
|
|
15
|
+
NAME,
|
|
15
16
|
)
|
|
16
17
|
from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
|
|
17
18
|
from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
|
|
18
19
|
from datahub.ingestion.source.looker.lookml_config import (
|
|
19
20
|
DERIVED_VIEW_SUFFIX,
|
|
20
|
-
NAME,
|
|
21
21
|
LookMLSourceReport,
|
|
22
22
|
)
|
|
23
23
|
from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver
|
|
@@ -28,11 +28,10 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
28
28
|
StatefulIngestionConfigBase,
|
|
29
29
|
)
|
|
30
30
|
from datahub.utilities.lossy_collections import LossyList
|
|
31
|
+
from datahub.utilities.stats_collections import TopKDict, float_top_k_dict
|
|
31
32
|
|
|
32
33
|
logger = logging.getLogger(__name__)
|
|
33
34
|
|
|
34
|
-
NAME: str = "name"
|
|
35
|
-
|
|
36
35
|
BASE_PROJECT_NAME = "__BASE"
|
|
37
36
|
|
|
38
37
|
EXPLORE_FILE_EXTENSION = ".explore.lkml"
|
|
@@ -47,6 +46,9 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
|
|
|
47
46
|
@dataclass
|
|
48
47
|
class LookMLSourceReport(StaleEntityRemovalSourceReport):
|
|
49
48
|
git_clone_latency: Optional[timedelta] = None
|
|
49
|
+
looker_query_api_latency_seconds: TopKDict[str, float] = dataclass_field(
|
|
50
|
+
default_factory=float_top_k_dict
|
|
51
|
+
)
|
|
50
52
|
models_discovered: int = 0
|
|
51
53
|
models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
|
|
52
54
|
views_discovered: int = 0
|
|
@@ -81,6 +83,11 @@ class LookMLSourceReport(StaleEntityRemovalSourceReport):
|
|
|
81
83
|
self.api_stats = self._looker_api.compute_stats()
|
|
82
84
|
return super().compute_stats()
|
|
83
85
|
|
|
86
|
+
def report_looker_query_api_latency(
    self, view_urn: str, latency: timedelta
) -> None:
    """Record the Looker query API latency for a view, in seconds.

    Args:
        view_urn: Key under which the latency is stored.
        latency: Elapsed time; stored as ``latency.total_seconds()``.
    """
    self.looker_query_api_latency_seconds[view_urn] = latency.total_seconds()
|
|
90
|
+
|
|
84
91
|
|
|
85
92
|
class LookMLSourceConfig(
|
|
86
93
|
LookerCommonConfig, StatefulIngestionConfigBase, EnvConfigMixin
|
|
@@ -122,6 +129,16 @@ class LookMLSourceConfig(
|
|
|
122
129
|
description="List of regex patterns for LookML views to include in the extraction.",
|
|
123
130
|
)
|
|
124
131
|
parse_table_names_from_sql: bool = Field(True, description="See note below.")
|
|
132
|
+
# Opt-in switch for API-based view lineage; the `api` requirement is enforced
# by the `check_api_provided_for_view_lineage` validator.
use_api_for_view_lineage: bool = Field(
    False,
    # Adjacent literals are concatenated verbatim, so each fragment carries
    # its own trailing space.
    description="When enabled, uses Looker API to get SQL representation of views for lineage parsing instead of parsing LookML files directly. Requires 'api' configuration to be provided. "
    "Coverage of regex based lineage extraction has limitations, it only supports ${TABLE}.column_name syntax. See (https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions) to "
    "understand the other substitutions and cross-references allowed in LookML.",
)
use_api_cache_for_view_lineage: bool = Field(
    False,
    description="When enabled, uses Looker API server-side caching for query execution. Requires 'api' configuration to be provided.",
)
|
|
125
142
|
api: Optional[LookerAPIConfig] = None
|
|
126
143
|
project_name: Optional[str] = Field(
|
|
127
144
|
None,
|
|
@@ -239,6 +256,17 @@ class LookMLSourceConfig(
|
|
|
239
256
|
)
|
|
240
257
|
return values
|
|
241
258
|
|
|
259
|
+
@root_validator(skip_on_failure=True)
def check_api_provided_for_view_lineage(cls, values):
    """Validate that we must have an api credential to use Looker API for view's column lineage

    Raises:
        ValueError: if `use_api_for_view_lineage` is truthy while `api` is missing/falsy.
    """
    if not values.get("api") and values.get("use_api_for_view_lineage"):
        # Adjacent literals are concatenated verbatim; each sentence needs its
        # own trailing space so the message does not read "extraction.Set".
        raise ValueError(
            "API credential was not found. LookML source requires api credentials "
            "for Looker to use Looker APIs for view's column lineage extraction. "
            "Set `use_api_for_view_lineage` to False to skip using Looker APIs."
        )
    return values
|
|
269
|
+
|
|
242
270
|
@validator("base_folder", always=True)
|
|
243
271
|
def check_base_folder_if_not_provided(
|
|
244
272
|
cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
|
|
@@ -4,10 +4,10 @@ import logging
|
|
|
4
4
|
from typing import ClassVar, Dict, List, Set
|
|
5
5
|
|
|
6
6
|
from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
|
|
7
|
+
from datahub.ingestion.source.looker.looker_constant import NAME
|
|
7
8
|
from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
|
|
8
9
|
from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
|
|
9
10
|
from datahub.ingestion.source.looker.lookml_config import (
|
|
10
|
-
NAME,
|
|
11
11
|
LookMLSourceConfig,
|
|
12
12
|
LookMLSourceReport,
|
|
13
13
|
)
|
|
@@ -142,6 +142,8 @@ class LookerView:
|
|
|
142
142
|
ctx: PipelineContext,
|
|
143
143
|
extract_col_level_lineage: bool = False,
|
|
144
144
|
populate_sql_logic_in_descriptions: bool = False,
|
|
145
|
+
looker_client: Optional[LookerAPI] = None,
|
|
146
|
+
view_to_explore_map: Optional[Dict[str, str]] = None,
|
|
145
147
|
) -> Optional["LookerView"]:
|
|
146
148
|
view_name = view_context.name()
|
|
147
149
|
|
|
@@ -160,6 +162,8 @@ class LookerView:
|
|
|
160
162
|
config=config,
|
|
161
163
|
ctx=ctx,
|
|
162
164
|
reporter=reporter,
|
|
165
|
+
looker_client=looker_client,
|
|
166
|
+
view_to_explore_map=view_to_explore_map,
|
|
163
167
|
)
|
|
164
168
|
|
|
165
169
|
field_type_vs_raw_fields = OrderedDict(
|
|
@@ -705,6 +709,11 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
705
709
|
# Value: Tuple(model file name, connection name)
|
|
706
710
|
view_connection_map: Dict[str, Tuple[str, str]] = {}
|
|
707
711
|
|
|
712
|
+
# Map of view name to explore name for API-based view lineage
|
|
713
|
+
# A view can be referenced by multiple explores, we only need one of the explores to use Looker Query API
|
|
714
|
+
# Key: view_name, Value: explore_name
|
|
715
|
+
view_to_explore_map: Dict[str, str] = {}
|
|
716
|
+
|
|
708
717
|
# The ** means "this directory and all subdirectories", and hence should
|
|
709
718
|
# include all the files we want.
|
|
710
719
|
model_files = sorted(
|
|
@@ -759,37 +768,37 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
759
768
|
)
|
|
760
769
|
)
|
|
761
770
|
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
continue
|
|
771
|
+
model_explores_map = {d["name"]: d for d in model.explores}
|
|
772
|
+
for explore_dict in model.explores:
|
|
773
|
+
try:
|
|
774
|
+
if LookerRefinementResolver.is_refinement(explore_dict["name"]):
|
|
775
|
+
continue
|
|
768
776
|
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
for view_name in explore.upstream_views:
|
|
777
|
+
explore_dict = looker_refinement_resolver.apply_explore_refinement(
|
|
778
|
+
explore_dict
|
|
779
|
+
)
|
|
780
|
+
explore: LookerExplore = LookerExplore.from_dict(
|
|
781
|
+
model_name,
|
|
782
|
+
explore_dict,
|
|
783
|
+
model.resolved_includes,
|
|
784
|
+
viewfile_loader,
|
|
785
|
+
self.reporter,
|
|
786
|
+
model_explores_map,
|
|
787
|
+
)
|
|
788
|
+
if explore.upstream_views:
|
|
789
|
+
for view_name in explore.upstream_views:
|
|
790
|
+
if self.source_config.emit_reachable_views_only:
|
|
784
791
|
explore_reachable_views.add(view_name.include)
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
792
|
+
# Build view to explore mapping for API-based view lineage
|
|
793
|
+
view_to_explore_map[view_name.include] = explore.name
|
|
794
|
+
except Exception as e:
|
|
795
|
+
self.reporter.report_warning(
|
|
796
|
+
title="Failed to process explores",
|
|
797
|
+
message="Failed to process explore dictionary.",
|
|
798
|
+
context=f"Explore Details: {explore_dict}",
|
|
799
|
+
exc=e,
|
|
800
|
+
)
|
|
801
|
+
logger.debug("Failed to process explore", exc_info=e)
|
|
793
802
|
|
|
794
803
|
processed_view_files = processed_view_map.setdefault(
|
|
795
804
|
model.connection, set()
|
|
@@ -878,6 +887,10 @@ class LookMLSource(StatefulIngestionSourceBase):
|
|
|
878
887
|
populate_sql_logic_in_descriptions=self.source_config.populate_sql_logic_for_missing_descriptions,
|
|
879
888
|
config=self.source_config,
|
|
880
889
|
ctx=self.ctx,
|
|
890
|
+
looker_client=self.looker_client,
|
|
891
|
+
view_to_explore_map=view_to_explore_map
|
|
892
|
+
if view_to_explore_map
|
|
893
|
+
else None,
|
|
881
894
|
)
|
|
882
895
|
except Exception as e:
|
|
883
896
|
self.reporter.report_warning(
|