acryl-datahub 1.2.0.11rc1__py3-none-any.whl → 1.2.0.11rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/METADATA +2685 -2685
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/RECORD +21 -19
- datahub/_version.py +1 -1
- datahub/configuration/validate_field_removal.py +3 -0
- datahub/ingestion/source/looker/looker_common.py +6 -0
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +30 -2
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +42 -29
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/sdk/search_filters.py +122 -1
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/view_upstream.py
CHANGED

@@ -1,18 +1,33 @@
 import logging
 import re
 from abc import ABC, abstractmethod
+from datetime import datetime
 from functools import lru_cache
 from typing import Dict, List, Optional
 
+from looker_sdk.sdk.api40.models import (
+    WriteQuery,
+)
+
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.looker.looker_common import (
     LookerExplore,
     LookerViewId,
     ViewField,
+    ViewFieldDimensionGroupType,
     ViewFieldType,
 )
 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_constant import (
+    NAME,
+    VIEW_FIELD_INTERVALS_ATTRIBUTE,
+    VIEW_FIELD_TIMEFRAMES_ATTRIBUTE,
+    VIEW_FIELD_TYPE_ATTRIBUTE,
+)
+from datahub.ingestion.source.looker.looker_lib_wrapper import (
+    LookerAPI,
+)
 from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache
 from datahub.ingestion.source.looker.lookml_concept_context import (
     LookerFieldContext,
@@ -20,7 +35,6 @@ from datahub.ingestion.source.looker.lookml_concept_context import (
 )
 from datahub.ingestion.source.looker.lookml_config import (
     DERIVED_VIEW_SUFFIX,
-    NAME,
     LookMLSourceConfig,
     LookMLSourceReport,
 )
@@ -280,6 +294,447 @@ class AbstractViewUpstream(ABC):
         return upstream_column_refs
 
 
+class LookerQueryAPIBasedViewUpstream(AbstractViewUpstream):
+    """
+    Implements Looker view upstream lineage extraction using the Looker Query API.
+
+    This class leverages the Looker API to generate the fully resolved SQL for a Looker view by constructing a WriteQuery
+    that includes all dimensions, dimension groups and measures. The SQL is then parsed to extract column-level lineage.
+    The Looker client is required for this class, as it is used to execute the WriteQuery and retrieve the SQL.
+
+    Other view upstream implementations use string parsing to extract lineage information from the SQL, which does not cover all the edge cases.
+    Limitations of string based lineage extraction: Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions
+
+    Key Features:
+    - Requires a Looker client (`looker_client`) to execute queries and retrieve SQL for the view.
+    - Requires a `view_to_explore_map` to map view names to their corresponding explore name.
+    - Field name translation is handled: Looker API field names are constructed as `<view_name>.<field_name>`, and helper
+      methods are provided to convert between Looker API field names and raw field names.
+    - SQL parsing is cached for efficiency, and the class is designed to gracefully fall back if the Looker Query API fails.
+    - All lineage extraction is based on the SQL returned by the Looker API, ensuring accurate and up-to-date lineage.
+
+    Why view_to_explore_map is required:
+    The Looker Query API expects the explore name (not the view name) as the "view" parameter in the WriteQuery.
+    In Looker, a view can be referenced by multiple explores, but the API needs any one of the
+    explores to access the view's fields.
+
+    Example WriteQuery request (see `_execute_query` for details):
+    {
+        "model": "test_model",
+        "view": "users_explore",  # This is the explore name, not the view name
+        "fields": [
+            "users.email", "users.lifetime_purchase_count"
+        ],
+        "limit": "1",
+        "cache": true
+    }
+    The SQL response is then parsed to extract upstream tables and column-level lineage.
+
+    For further details, see the method-level docstrings, especially:
+    - `__get_spr`: SQL parsing and lineage extraction workflow
+    - `_get_sql_write_query`: WriteQuery construction and field enumeration
+    - `_execute_query`: Looker API invocation and SQL retrieval - this only generates the SQL query, does not execute it
+    - Field name translation: `_get_looker_api_field_name` and `_get_field_name_from_looker_api_field_name`
+
+    Note: This class is intended to be robust and raise exceptions if SQL parsing or API calls fail, and will fall back to
+    other implementations - custom regex-based parsing if necessary.
+    """
+
+    def __init__(
+        self,
+        view_context: LookerViewContext,
+        looker_view_id_cache: LookerViewIdCache,
+        config: LookMLSourceConfig,
+        reporter: LookMLSourceReport,
+        ctx: PipelineContext,
+        looker_client: LookerAPI,
+        view_to_explore_map: Dict[str, str],
+    ):
+        super().__init__(view_context, looker_view_id_cache, config, reporter, ctx)
+        self.looker_client = looker_client
+        self.view_to_explore_map = view_to_explore_map
+        # Cache the SQL parsing results
+        # We use maxsize=1 because a new class instance is created for each view, Ref: view_upstream.create_view_upstream
+        self._get_spr = lru_cache(maxsize=1)(self.__get_spr)
+        self._get_upstream_dataset_urn = lru_cache(maxsize=1)(
+            self.__get_upstream_dataset_urn
+        )
+
+        # Initialize the cache
+        # Done to fallback to other implementations if the Looker Query API fails
+        self._get_spr()
+
+    def __get_spr(self) -> SqlParsingResult:
+        """
+        Retrieves the SQL parsing result for the current Looker view by:
+        1. Building a WriteQuery for the view.
+        2. Executing the query via the Looker API to get the SQL.
+        3. Parsing the SQL to extract lineage information.
+
+        Returns:
+            SqlParsingResult if successful, otherwise None.
+        Raises:
+            ValueError: If no SQL is found in the response.
+            ValueError: If no fields are found for the view.
+            ValueError: If explore name is not found for the view.
+            ValueError: If error in parsing SQL for upstream tables.
+            ValueError: If error in parsing SQL for column lineage.
+        """
+        try:
+            # Build the WriteQuery for the current view.
+            sql_query: WriteQuery = self._get_sql_write_query()
+
+            # Execute the query to get the SQL representation from Looker.
+            sql_response = self._execute_query(sql_query)
+
+            # Parse the SQL to extract lineage information.
+            spr = create_lineage_sql_parsed_result(
+                query=sql_response,
+                default_schema=self.view_context.view_connection.default_schema,
+                default_db=self.view_context.view_connection.default_db,
+                platform=self.view_context.view_connection.platform,
+                platform_instance=self.view_context.view_connection.platform_instance,
+                env=self.view_context.view_connection.platform_env or self.config.env,
+                graph=self.ctx.graph,
+            )
+
+            # Check for errors encountered during table extraction.
+            table_error = spr.debug_info.table_error
+            if table_error is not None:
+                self.reporter.report_warning(
+                    title="Table Level Lineage Extraction Failed",
+                    message="Error in parsing derived sql",
+                    context=f"View-name: {self.view_context.name()}",
+                    exc=table_error,
+                )
+                raise ValueError(
+                    f"Error in parsing SQL for upstream tables: {table_error}"
+                )
+
+            column_error = spr.debug_info.column_error
+            if column_error is not None:
+                self.reporter.report_warning(
+                    title="Column Level Lineage Extraction Failed",
+                    message="Error in parsing derived sql",
+                    context=f"View-name: {self.view_context.name()}",
+                    exc=column_error,
+                )
+                raise ValueError(
+                    f"Error in parsing SQL for column lineage: {column_error}"
+                )
+
+            return spr
+        except Exception:
+            # Reraise the exception to allow higher-level handling.
+            raise
+
+    def _get_time_dim_group_field_name(self, dim_group: dict) -> str:
+        """
+        Time dimension groups must be referenced by their individual timeframes suffix.
+        Example:
+            dimension_group: created {
+                type: time
+                timeframes: [date, week, month]
+                sql: ${TABLE}.created_at ;;
+            }
+            Used as: {view_name.date_created}
+
+            created -> created_date, created_week, created_month
+        # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#dimension_groups_must_be_referenced_by_their_individual_dimensions
+        """
+        dim_group_name = dim_group.get(NAME)
+        timeframes = dim_group.get(VIEW_FIELD_TIMEFRAMES_ATTRIBUTE)
+
+        # If timeframes is not included (rare case), the dimension group will include all possible timeframes.
+        # We will pick to use "raw"
+        suffix = timeframes[0] if timeframes else "raw"
+        return f"{dim_group_name}_{suffix}"
+
+    def _get_duration_dim_group_field_name(self, dim_group: dict) -> str:
+        """
+        Duration dimension groups must be referenced by their plural version of the interval value as prefix
+        Example:
+            dimension_group: since_event {
+                type: duration
+                intervals: [hour, day, week, month, quarter, year]
+                sql_start: ${faa_event_date_raw} ;;
+                sql_end: CURRENT_TIMESTAMP();;
+            }
+            Used as: {view_name.hours_since_event}
+
+            since_event -> hours_since_event, days_since_event, weeks_since_event, months_since_event, quarters_since_event, years_since_event
+        # Ref: https://cloud.google.com/looker/docs/reference/param-field-dimension-group#referencing_intervals_from_another_lookml_field
+        """
+        dim_group_name = dim_group.get(NAME)
+        intervals = dim_group.get(VIEW_FIELD_INTERVALS_ATTRIBUTE)
+
+        # If intervals is not included (rare case), the dimension group will include all possible intervals.
+        # We will pick to use "day" -> "days"
+        prefix = f"{intervals[0]}s" if intervals else "days"
+        return f"{prefix}_{dim_group_name}"
+
+    def _get_sql_write_query(self) -> WriteQuery:
+        """
+        Constructs a WriteQuery object to obtain the SQL representation of the current Looker view.
+
+        We need to list all the fields for the view to get the SQL representation of the view - this fully resolved SQL for view dimensions and measures.
+
+        The method uses the view_to_explore_map to determine the correct explore name to use in the WriteQuery.
+        This is crucial because the Looker Query API expects the explore name (not the view name) as the "view" parameter.
+
+        Ref: https://cloud.google.com/looker/docs/reference/param-field-sql#sql_for_dimensions
+
+        Returns:
+            WriteQuery: The WriteQuery object if fields are found and explore name is available, otherwise None.
+
+        Raises:
+            ValueError: If the explore name is not found in the view_to_explore_map for the current view.
+            ValueError: If no fields are found for the view.
+        """
+
+        # Collect all dimension and measure fields for the view.
+        view_fields: List[str] = []
+        # Add dimension fields in the format: <view_name>.<dimension_name> or <view_name>.<measure_name>
+        for field in self.view_context.dimensions() + self.view_context.measures():
+            field_name = field.get(NAME)
+            assert field_name  # Happy linter
+            view_fields.append(self._get_looker_api_field_name(field_name))
+
+        for dim_group in self.view_context.dimension_groups():
+            dim_group_type: ViewFieldDimensionGroupType = ViewFieldDimensionGroupType(
+                dim_group.get(VIEW_FIELD_TYPE_ATTRIBUTE)
+            )
+
+            if dim_group_type == ViewFieldDimensionGroupType.TIME:
+                view_fields.append(
+                    self._get_looker_api_field_name(
+                        self._get_time_dim_group_field_name(dim_group)
+                    )
+                )
+            elif dim_group_type == ViewFieldDimensionGroupType.DURATION:
+                view_fields.append(
+                    self._get_looker_api_field_name(
+                        self._get_duration_dim_group_field_name(dim_group)
+                    )
+                )
+
+        # Use explore name from view_to_explore_map if available
+        # explore_name is always present in the view_to_explore_map because of the check in view_upstream.create_view_upstream
+        explore_name = self.view_to_explore_map.get(self.view_context.name())
+        assert explore_name  # Happy linter
+
+        if not view_fields:
+            raise ValueError(
+                f"No fields found for view '{self.view_context.name()}'. Cannot proceed with Looker API for view lineage."
+            )
+
+        # Construct and return the WriteQuery object.
+        # The 'limit' is set to "1" as the query is only used to obtain SQL, not to fetch data.
+        return WriteQuery(
+            model=self.looker_view_id_cache.model_name,
+            view=explore_name,
+            fields=view_fields,
+            filters={},
+            limit="1",
+        )
+
+    def _execute_query(self, query: WriteQuery) -> str:
+        """
+        Executes a Looker SQL query using the Looker API and returns the SQL string.
+
+        Ref: https://cloud.google.com/looker/docs/reference/looker-api/latest/methods/Query/run_inline_query
+
+        Example Request:
+        WriteQuery:
+        {
+            "model": "test_model",
+            "view": "users",
+            "fields": [
+                "users.email", "users.lifetime_purchase_count"
+            ],
+            "limit": "1",
+            "cache": true
+        }
+
+        Response:
+        "
+        SELECT
+            users."EMAIL" AS "users.email",
+            COUNT(DISTINCT ( purchases."PK" ) ) AS "users.lifetime_purchase_count"
+        FROM "ECOMMERCE"."USERS" AS users
+        LEFT JOIN "ECOMMERCE"."PURCHASES" AS purchases ON (users."PK") = (purchases."USER_FK")
+        GROUP BY
+            1
+        ORDER BY
+            2 DESC
+        FETCH NEXT 1 ROWS ONLY
+        "
+        Args:
+            query (WriteQuery): The Looker WriteQuery object to execute.
+
+        Returns:
+            str: The SQL string returned by the Looker API, or an empty string if execution fails.
+        """
+
+        # Record the start time for latency measurement.
+        start_time = datetime.now()
+
+        # Execute the query using the Looker client.
+        sql_response = self.looker_client.generate_sql_query(
+            write_query=query, use_cache=self.config.use_api_cache_for_view_lineage
+        )
+
+        # Record the end time after query execution.
+        end_time = datetime.now()
+
+        # Attempt to get the LookerViewId for reporting.
+        looker_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
+        )
+
+        # Report the query API latency if the view ID is available.
+        if looker_view_id is not None:
+            self.reporter.report_looker_query_api_latency(
+                looker_view_id.get_urn(self.config),
+                end_time - start_time,
+            )
+
+        # Validate the response structure.
+        if not sql_response:
+            raise ValueError(
+                f"No SQL found in response for view '{self.view_context.name()}'. Response: {sql_response}"
+            )
+
+        # Extract the SQL string from the response.
+        return sql_response
+
+    def __get_upstream_dataset_urn(self) -> List[Urn]:
+        """
+        Extract upstream dataset URNs by parsing the SQL for the current view.
+
+        Returns:
+            List[Urn]: List of upstream dataset URNs, or an empty list if parsing fails.
+        """
+        # Attempt to get the SQL parsing result for the current view.
+        spr: SqlParsingResult = self._get_spr()
+
+        # Remove any 'hive.' prefix from upstream table URNs.
+        upstream_dataset_urns: List[str] = [
+            _drop_hive_dot(urn) for urn in spr.in_tables
+        ]
+
+        # Fix any derived view references present in the URNs.
+        upstream_dataset_urns = fix_derived_view_urn(
+            urns=upstream_dataset_urns,
+            looker_view_id_cache=self.looker_view_id_cache,
+            base_folder_path=self.view_context.base_folder_path,
+            config=self.config,
+        )
+
+        return upstream_dataset_urns
+
+    def _get_looker_api_field_name(self, field_name: str) -> str:
+        """
+        Translate the field name to the looker api field name
+
+        Example:
+            pk -> purchases.pk
+        """
+        return f"{self.view_context.name()}.{field_name}"
+
+    def _get_field_name_from_looker_api_field_name(
+        self, looker_api_field_name: str
+    ) -> str:
+        """
+        Translate the looker api field name to the field name
+
+        Example:
+            purchases.pk -> pk
+        """
+        # Remove the view name at the start and the dot from the looker_api_field_name, but only if it matches the current view name
+        prefix = f"{self.view_context.name()}."
+        if looker_api_field_name.startswith(prefix):
+            return looker_api_field_name[len(prefix) :]
+        else:
+            # Don't throw an error, just return the original field name
+            return looker_api_field_name
+
+    def get_upstream_dataset_urn(self) -> List[Urn]:
+        """Get upstream dataset URNs"""
+        return self._get_upstream_dataset_urn()
+
+    def get_upstream_column_ref(
+        self, field_context: LookerFieldContext
+    ) -> List[ColumnRef]:
+        """Return upstream column references for a given field."""
+        spr: SqlParsingResult = self._get_spr()
+        if not spr.column_lineage:
+            return []
+
+        field_type: Optional[ViewFieldDimensionGroupType] = None
+        field_name = field_context.name()
+        try:
+            # Try if field is a dimension group
+            field_type = ViewFieldDimensionGroupType(
+                field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)
+            )
+
+            if field_type == ViewFieldDimensionGroupType.TIME:
+                field_name = self._get_time_dim_group_field_name(
+                    field_context.raw_field
+                )
+            elif field_type == ViewFieldDimensionGroupType.DURATION:
+                field_name = self._get_duration_dim_group_field_name(
+                    field_context.raw_field
+                )
+
+        except Exception:
+            # Not a dimension group, no modification needed
+            logger.debug(
+                f"view-name={self.view_context.name()}, field-name={field_name}, field-type={field_context.raw_field.get(VIEW_FIELD_TYPE_ATTRIBUTE)}"
+            )
+
+        field_api_name = self._get_looker_api_field_name(field_name).lower()
+
+        upstream_refs: List[ColumnRef] = []
+
+        for lineage in spr.column_lineage:
+            if lineage.downstream.column.lower() == field_api_name:
+                for upstream in lineage.upstreams:
+                    upstream_refs.append(
+                        ColumnRef(table=upstream.table, column=upstream.column)
+                    )
+
+        return _drop_hive_dot_from_upstream(upstream_refs)
+
+    def create_fields(self) -> List[ViewField]:
+        """Create ViewField objects from SQL parsing result."""
+        spr: SqlParsingResult = self._get_spr()
+
+        if not spr.column_lineage:
+            return []
+
+        fields: List[ViewField] = []
+
+        for lineage in spr.column_lineage:
+            fields.append(
+                ViewField(
+                    name=self._get_field_name_from_looker_api_field_name(
+                        lineage.downstream.column
+                    ),
+                    label="",
+                    type=lineage.downstream.native_column_type or "unknown",
+                    description="",
+                    field_type=ViewFieldType.UNKNOWN,
+                    upstream_fields=_drop_hive_dot_from_upstream(lineage.upstreams),
+                )
+            )
+        return fields
+
+
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
     Handle the case where upstream dataset is defined in derived_table.sql
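Aside: the two dimension-group naming rules above are the subtle part of the new class. Below is a minimal, self-contained sketch of how they compose with the `<view_name>.<field_name>` translation; the `users` view and the sample dimension-group dicts are hypothetical, and the suffix/prefix logic mirrors `_get_time_dim_group_field_name` / `_get_duration_dim_group_field_name` from the hunk above:

    # Minimal sketch of the dimension-group naming rules.
    # "users" and the sample dim-group dicts are hypothetical.
    def time_dim_group_field_name(dim_group: dict) -> str:
        # created + timeframes [date, week, month] -> created_date (first timeframe wins)
        timeframes = dim_group.get("timeframes")
        suffix = timeframes[0] if timeframes else "raw"
        return f"{dim_group['name']}_{suffix}"

    def duration_dim_group_field_name(dim_group: dict) -> str:
        # since_event + intervals [hour, day] -> hours_since_event (pluralized first interval)
        intervals = dim_group.get("intervals")
        prefix = f"{intervals[0]}s" if intervals else "days"
        return f"{prefix}_{dim_group['name']}"

    view_name = "users"
    created = {"name": "created", "type": "time", "timeframes": ["date", "week", "month"]}
    since_event = {"name": "since_event", "type": "duration", "intervals": ["hour", "day"]}

    assert f"{view_name}.{time_dim_group_field_name(created)}" == "users.created_date"
    assert f"{view_name}.{duration_dim_group_field_name(since_event)}" == "users.hours_since_event"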
@@ -674,7 +1129,45 @@ def create_view_upstream(
     config: LookMLSourceConfig,
     ctx: PipelineContext,
     reporter: LookMLSourceReport,
+    looker_client: Optional["LookerAPI"] = None,
+    view_to_explore_map: Optional[Dict[str, str]] = None,
 ) -> AbstractViewUpstream:
+    # Looker client is required for LookerQueryAPIBasedViewUpstream also enforced by config.use_api_for_view_lineage
+    # view_to_explore_map is required for Looker query API args
+    # Only process if view exists in view_to_explore_map, because we cannot query views which are not reachable from an explore
+    if (
+        config.use_api_for_view_lineage
+        and looker_client
+        and view_to_explore_map
+        and view_context.name() in view_to_explore_map
+    ):
+        try:
+            return LookerQueryAPIBasedViewUpstream(
+                view_context=view_context,
+                config=config,
+                reporter=reporter,
+                ctx=ctx,
+                looker_view_id_cache=looker_view_id_cache,
+                looker_client=looker_client,
+                view_to_explore_map=view_to_explore_map,
+            )
+        except Exception as e:
+            # Falling back to custom regex-based parsing - best effort approach
+            reporter.report_warning(
+                title="Looker Query API based View Upstream Failed",
+                message="Error in getting upstream lineage for view using Looker Query API",
+                context=f"View-name: {view_context.name()}",
+                exc=e,
+            )
+    else:
+        logger.debug(
+            f"Skipping Looker Query API for view: {view_context.name()} because one or more conditions are not met: "
+            f"use_api_for_view_lineage={config.use_api_for_view_lineage}, "
+            f"looker_client={'set' if looker_client else 'not set'}, "
+            f"view_to_explore_map={'set' if view_to_explore_map else 'not set'}, "
+            f"view_in_view_to_explore_map={view_context.name() in view_to_explore_map if view_to_explore_map else False}"
+        )
+
     if view_context.is_regular_case():
         return RegularViewUpstream(
             view_context=view_context,
datahub/sdk/search_filters.py
CHANGED

@@ -30,7 +30,14 @@ from datahub.ingestion.graph.filters import (
     _get_status_filter,
 )
 from datahub.metadata.schema_classes import EntityTypeName
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    ContainerUrn,
+    CorpGroupUrn,
+    CorpUserUrn,
+    DataPlatformUrn,
+    DomainUrn,
+)
+from datahub.utilities.urns.urn import guess_entity_type
 
 _AndSearchFilterRule = TypedDict(
     "_AndSearchFilterRule", {"and": List[SearchFilterRule]}
@@ -235,6 +242,94 @@ class _EnvFilter(_BaseFilter):
     ]
 
 
+class _OwnerFilter(_BaseFilter):
+    """Filter for entities owned by specific users or groups."""
+
+    owner: List[str] = pydantic.Field(
+        description="The owner to filter on. Should be user or group URNs.",
+    )
+
+    @pydantic.validator("owner", each_item=True)
+    def validate_owner(cls, v: str) -> str:
+        if not v.startswith("urn:li:"):
+            raise ValueError(f"Owner must be a valid User or Group URN, got: {v}")
+        _type = guess_entity_type(v)
+        if _type == CorpUserUrn.ENTITY_TYPE:
+            return str(CorpUserUrn.from_string(v))
+        elif _type == CorpGroupUrn.ENTITY_TYPE:
+            return str(CorpGroupUrn.from_string(v))
+        else:
+            raise ValueError(f"Owner must be a valid User or Group URN, got: {v}")
+
+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="owners",
+            condition="EQUAL",
+            values=self.owner,
+        )
+
+    def compile(self) -> _OrFilters:
+        return [{"and": [self._build_rule()]}]
+
+
+class _GlossaryTermFilter(_BaseFilter):
+    """Filter for entities associated with specific glossary terms."""
+
+    glossary_term: List[str] = pydantic.Field(
+        description="The glossary term to filter on. Should be glossary term URNs.",
+    )
+
+    @pydantic.validator("glossary_term", each_item=True)
+    def validate_glossary_term(cls, v: str) -> str:
+        if not v.startswith("urn:li:"):
+            raise ValueError(f"Glossary term must be a valid URN, got: {v}")
+        # Validate that it's a glossary term URN
+        _type = guess_entity_type(v)
+        if _type != "glossaryTerm":
+            raise ValueError(
+                f"Glossary term must be a valid glossary term URN, got: {v}"
+            )
+        return v
+
+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="glossaryTerms",
+            condition="EQUAL",
+            values=self.glossary_term,
+        )
+
+    def compile(self) -> _OrFilters:
+        return [{"and": [self._build_rule()]}]
+
+
+class _TagFilter(_BaseFilter):
+    """Filter for entities associated with specific tags."""
+
+    tag: List[str] = pydantic.Field(
+        description="The tag to filter on. Should be tag URNs.",
+    )
+
+    @pydantic.validator("tag", each_item=True)
+    def validate_tag(cls, v: str) -> str:
+        if not v.startswith("urn:li:"):
+            raise ValueError(f"Tag must be a valid URN, got: {v}")
+        # Validate that it's a tag URN
+        _type = guess_entity_type(v)
+        if _type != "tag":
+            raise ValueError(f"Tag must be a valid tag URN, got: {v}")
+        return v
+
+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="tags",
+            condition="EQUAL",
+            values=self.tag,
+        )
+
+    def compile(self) -> _OrFilters:
+        return [{"and": [self._build_rule()]}]
+
+
 class _CustomCondition(_BaseFilter):
     """Represents a single field condition."""
 
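A short worked example of what these validators enforce, using the `FilterDsl` helpers added further down. Owner URNs are normalized through the CorpUserUrn/CorpGroupUrn classes, while tag and glossary-term URNs are only type-checked; the URNs below are illustrative:

    from datahub.sdk.search_filters import FilterDsl as F

    # Accepted: normalized via CorpUserUrn.from_string
    print(F.owner("urn:li:corpuser:jdoe").compile())
    # roughly: [{'and': [SearchFilterRule(field='owners', condition='EQUAL',
    #                                     values=['urn:li:corpuser:jdoe'])]}]

    # Rejected: not a user/group URN
    try:
        F.owner("urn:li:tag:PII")
    except Exception as e:
        print(e)  # pydantic ValidationError: Owner must be a valid User or Group URN

    # Rejected: the tag filter requires a tag URN
    try:
        F.tag("urn:li:corpuser:jdoe")
    except Exception as e:
        print(e)  # pydantic ValidationError: Tag must be a valid tag URN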
@@ -407,6 +502,9 @@ if TYPE_CHECKING or not PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR:
         _DomainFilter,
         _ContainerFilter,
         _EnvFilter,
+        _OwnerFilter,
+        _GlossaryTermFilter,
+        _TagFilter,
         _CustomCondition,
     ]
 
@@ -448,6 +546,11 @@ else:
             _ContainerFilter, Tag(_ContainerFilter._field_discriminator())
         ],
         Annotated[_EnvFilter, Tag(_EnvFilter._field_discriminator())],
+        Annotated[_OwnerFilter, Tag(_OwnerFilter._field_discriminator())],
+        Annotated[
+            _GlossaryTermFilter, Tag(_GlossaryTermFilter._field_discriminator())
+        ],
+        Annotated[_TagFilter, Tag(_TagFilter._field_discriminator())],
         Annotated[
             _CustomCondition, Tag(_CustomCondition._field_discriminator())
         ],
@@ -551,6 +654,24 @@ class FilterDsl:
     def env(env: Union[str, Sequence[str]], /) -> _EnvFilter:
        return _EnvFilter(env=[env] if isinstance(env, str) else env)
 
+    @staticmethod
+    def owner(owner: Union[str, Sequence[str]], /) -> _OwnerFilter:
+        return _OwnerFilter(owner=[owner] if isinstance(owner, str) else owner)
+
+    @staticmethod
+    def glossary_term(
+        glossary_term: Union[str, Sequence[str]], /
+    ) -> _GlossaryTermFilter:
+        return _GlossaryTermFilter(
+            glossary_term=[glossary_term]
+            if isinstance(glossary_term, str)
+            else glossary_term
+        )
+
+    @staticmethod
+    def tag(tag: Union[str, Sequence[str]], /) -> _TagFilter:
+        return _TagFilter(tag=[tag] if isinstance(tag, str) else tag)
+
     @staticmethod
     def has_custom_property(key: str, value: str) -> _CustomCondition:
         return _CustomCondition(