acryl-datahub 0.15.0rc16__py3-none-any.whl → 0.15.0rc17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc17.dist-info}/METADATA +2512 -2512
- {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc17.dist-info}/RECORD +33 -31
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +7 -5
- datahub/cli/delete_cli.py +66 -20
- datahub/configuration/common.py +3 -3
- datahub/ingestion/api/source.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +1 -1
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/source/kafka/kafka.py +18 -11
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +12 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +17 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +45 -5
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +35 -16
- datahub/ingestion/source/tableau/tableau_common.py +0 -1
- datahub/metadata/_schema_classes.py +122 -2
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +73 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sqlglot_lineage.py +48 -13
- datahub/testing/doctest.py +12 -0
- {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc17.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc17.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc17.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 
@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext
@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
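The new method closes a case-sensitivity gap: a Looker view may spell a column differently than the warehouse does. Below is a minimal sketch of that matching behavior, using a plain dict in place of the real GMS-backed schema info and a simplified stand-in for match_columns_to_schema (the names and signature here are illustrative, not the library's API):

    from typing import Dict, List

    # Hypothetical stand-in for a resolved schema: column name -> type.
    SchemaInfo = Dict[str, str]

    def match_columns_to_schema_sketch(
        schema_info: SchemaInfo, input_columns: List[str]
    ) -> List[str]:
        # Case-insensitive index of the platform's actual column names.
        column_from_gms: Dict[str, str] = {c.lower(): c for c in schema_info}
        # Prefer the platform's casing when a match exists; otherwise fall
        # back to lowercase, mirroring the else-branch in the hunk above.
        return [column_from_gms.get(c.lower(), c.lower()) for c in input_columns]

    schema = {"ID": "NUMBER", "Customer_Name": "VARCHAR"}
    print(match_columns_to_schema_sketch(schema, ["id", "CUSTOMER_NAME", "missing"]))
    # ['ID', 'Customer_Name', 'missing']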
@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(
@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]
@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,7 +129,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
     FROM {db_clause}information_schema.tables t
     WHERE table_schema != 'INFORMATION_SCHEMA'
     and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
@@ -149,7 +151,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
     FROM {db_clause}information_schema.tables t
     where table_schema='{schema_name}'
     and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
    num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass
@@ -289,6 +296,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -313,6 +322,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
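The IS_DYNAMIC / IS_ICEBERG columns come back from INFORMATION_SCHEMA as "YES"/"NO" strings, so the two hunks above normalize them defensively. A small illustrative sketch of that parsing rule (the row dict is made up):

    def flag_from_information_schema(row: dict, key: str) -> bool:
        # A missing column (e.g. older Snowflake editions) defaults to "NO";
        # the comparison is case-insensitive.
        return row.get(key, "NO").upper() == "YES"

    row = {"IS_DYNAMIC": "yes"}  # hypothetical result row
    print(flag_from_information_schema(row, "IS_DYNAMIC"))  # True
    print(flag_from_information_schema(row, "IS_ICEBERG"))  # False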
@@ -356,6 +367,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )
 
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -431,6 +431,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                 default_db=db_name,
                 default_schema=schema_name,
             )
+        elif view.is_secure:
+            self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -749,8 +751,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable) and table.clustering_key:
-            custom_properties["CLUSTERING_KEY"] = table.clustering_key
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
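To see what the new branches produce, here is a small illustrative driver; the dataclass is a trimmed stand-in for datahub's SnowflakeTable, not the real class:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class FakeSnowflakeTable:  # trimmed stand-in, not datahub's class
        type: Optional[str] = None
        clustering_key: Optional[str] = None
        is_dynamic: bool = False
        is_iceberg: bool = False

        @property
        def is_hybrid(self) -> bool:
            return self.type == "HYBRID TABLE"

    def build_custom_properties(table: FakeSnowflakeTable) -> dict:
        # Mirrors the flag-to-property mapping added in the hunk above.
        props = {}
        if table.clustering_key:
            props["CLUSTERING_KEY"] = table.clustering_key
        if table.is_hybrid:
            props["IS_HYBRID"] = "true"
        if table.is_dynamic:
            props["IS_DYNAMIC"] = "true"
        if table.is_iceberg:
            props["IS_ICEBERG"] = "true"
        return props

    print(build_custom_properties(FakeSnowflakeTable(is_dynamic=True, is_iceberg=True)))
    # {'IS_DYNAMIC': 'true', 'IS_ICEBERG': 'true'}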
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name.split(".")
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
             )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0].strip('"'),
-        schema_name=name_parts[1].strip('"'),
-        table_name=name_parts[2].strip('"'),
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
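The _split_qualified_name doctests above double as executable examples, and the new datahub/testing/doctest.py module (+12 lines in the file list) presumably wraps the stdlib runner. A minimal sketch using plain stdlib doctest, assuming acryl-datahub is installed so the module imports:

    import doctest

    import datahub.ingestion.source.snowflake.snowflake_utils as snowflake_utils

    # Runs the >>> examples embedded in the module's docstrings,
    # including _split_qualified_name's.
    results = doctest.testmod(snowflake_utils, verbose=False)
    assert results.failed == 0, results
    print(f"ran {results.attempted} doctest examples")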
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()
datahub/ingestion/source/tableau/tableau.py

@@ -353,7 +353,7 @@ class TableauConfig(
 
     project_path_separator: str = Field(
         default="/",
-        description="The separator used for the
+        description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
         "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
     )
 
@@ -959,19 +959,36 @@ class TableauSiteSource:
         return is_allowed
 
     def _is_denied_project(self, project: TableauProject) -> bool:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?
+
+        Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:
+
+        - **A**
+          - **B** (Child of A)
+          - **C** (Child of A)
+        - **D**
+
+        In this setup:
+
+        - `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
+        - `extract_project_hierarchy` is set to `True`.
+
+        The goal is to extract assets from project A and its children while explicitly denying the child project B.
+
+        If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
+        This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
+        However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.
+
+        To address this, the function explicitly checks the deny regex to ensure that project C's assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
+        """
+
+        # Either project_pattern or project_path_pattern is set in a recipe
+        # TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.
+
+        return self.config.project_pattern.denied(
+            project.name
+        ) or self.config.project_path_pattern.denied(self._get_project_path(project))
 
     def _init_tableau_project_registry(self, all_project_map: dict) -> None:
         list_of_skip_projects: List[TableauProject] = []
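The scenario in that docstring can be reproduced with AllowDenyPattern directly; the configuration/common.py change in this release's file list is presumably what backs the denied() helper used here, so treat this as a hedged sketch rather than a spec:

    from datahub.configuration.common import AllowDenyPattern

    pattern = AllowDenyPattern(allow=["A"], deny=["B"])

    for project in ["A", "B", "C", "D"]:
        # allowed() requires an explicit allow match; denied() checks only the deny list.
        print(project, pattern.allowed(project), pattern.denied(project))
    # A: allowed=True,  denied=False -> ingested via the allow list
    # B: allowed=False, denied=True  -> dropped even though it is a child of A
    # C: allowed=False, denied=False -> kept via the hierarchy despite no allow match
    # D: allowed=False, denied=False -> never reached: its parent was not ingested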
@@ -999,9 +1016,11 @@
         for project in list_of_skip_projects:
             if (
                 project.parent_id in projects_to_ingest
-                and self._is_denied_project(project)
+                and not self._is_denied_project(project)
             ):
-                logger.debug(
+                logger.debug(
+                    f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
+                )
                 projects_to_ingest[project.id] = project
 
         # We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
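Net effect of the sign flip above: a skipped child project is now promoted into the registry only when its parent made it in and the child is not explicitly denied. A toy model of the loop (plain dicts and strings, not Tableau objects):

    projects_to_ingest = {"a-id": "A"}  # A matched the allow pattern
    skipped = [("b-id", "B", "a-id"), ("c-id", "C", "a-id"), ("e-id", "E", "d-id")]
    denied = {"B"}  # deny list from the recipe

    for proj_id, name, parent_id in skipped:
        # rc16 effectively required the child TO BE denied, inverting the intent;
        # rc17 requires the child NOT to be denied.
        if parent_id in projects_to_ingest and name not in denied:
            projects_to_ingest[proj_id] = name

    print(projects_to_ingest)
    # {'a-id': 'A', 'c-id': 'C'} -> B is denied; E's parent was never ingested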
datahub/ingestion/source/tableau/tableau_common.py

@@ -979,7 +979,6 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
         len(query_filter.keys()) == 1
         and query_filter.get(c.ID_WITH_IN)
         and isinstance(query_filter[c.ID_WITH_IN], list)
-        and len(query_filter[c.ID_WITH_IN]) > 100 * page_size
     ):
         ids = query_filter[c.ID_WITH_IN]
         filter_pages = [
datahub/metadata/_schema_classes.py

@@ -23267,7 +23267,7 @@ class StructuredPropertyDefinitionClass(_Aspect):
 
     @property
     def lastModified(self) -> Union[None, "AuditStampClass"]:
-        """
+        """Last Modified Audit stamp"""
         return self._inner_dict.get('lastModified')  # type: ignore
 
     @lastModified.setter
@@ -23280,7 +23280,7 @@ class StructuredPropertyKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'structuredPropertyKey'
-    ASPECT_INFO = {'keyForEntity': 'structuredProperty', 'entityCategory': 'core', 'entityAspects': ['propertyDefinition', 'institutionalMemory', 'status'], 'entityDoc': 'Structured Property represents a property meant for extending the core model of a logical entity'}
+    ASPECT_INFO = {'keyForEntity': 'structuredProperty', 'entityCategory': 'core', 'entityAspects': ['propertyDefinition', 'structuredPropertySettings', 'institutionalMemory', 'status'], 'entityDoc': 'Structured Property represents a property meant for extending the core model of a logical entity'}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.structured.StructuredPropertyKey")
 
     def __init__(self,
@@ -23304,6 +23304,122 @@ class StructuredPropertyKeyClass(_Aspect):
         self._inner_dict['id'] = value
 
 
+class StructuredPropertySettingsClass(_Aspect):
+    """Settings specific to a structured property entity"""
+
+
+    ASPECT_NAME = 'structuredPropertySettings'
+    ASPECT_INFO = {}
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.structured.StructuredPropertySettings")
+
+    def __init__(self,
+        isHidden: Optional[bool]=None,
+        showInSearchFilters: Optional[bool]=None,
+        showInAssetSummary: Optional[bool]=None,
+        showAsAssetBadge: Optional[bool]=None,
+        showInColumnsTable: Optional[bool]=None,
+        lastModified: Union[None, "AuditStampClass"]=None,
+    ):
+        super().__init__()
+
+        if isHidden is None:
+            # default: False
+            self.isHidden = self.RECORD_SCHEMA.fields_dict["isHidden"].default
+        else:
+            self.isHidden = isHidden
+        if showInSearchFilters is None:
+            # default: False
+            self.showInSearchFilters = self.RECORD_SCHEMA.fields_dict["showInSearchFilters"].default
+        else:
+            self.showInSearchFilters = showInSearchFilters
+        if showInAssetSummary is None:
+            # default: False
+            self.showInAssetSummary = self.RECORD_SCHEMA.fields_dict["showInAssetSummary"].default
+        else:
+            self.showInAssetSummary = showInAssetSummary
+        if showAsAssetBadge is None:
+            # default: False
+            self.showAsAssetBadge = self.RECORD_SCHEMA.fields_dict["showAsAssetBadge"].default
+        else:
+            self.showAsAssetBadge = showAsAssetBadge
+        if showInColumnsTable is None:
+            # default: False
+            self.showInColumnsTable = self.RECORD_SCHEMA.fields_dict["showInColumnsTable"].default
+        else:
+            self.showInColumnsTable = showInColumnsTable
+        self.lastModified = lastModified
+
+    def _restore_defaults(self) -> None:
+        self.isHidden = self.RECORD_SCHEMA.fields_dict["isHidden"].default
+        self.showInSearchFilters = self.RECORD_SCHEMA.fields_dict["showInSearchFilters"].default
+        self.showInAssetSummary = self.RECORD_SCHEMA.fields_dict["showInAssetSummary"].default
+        self.showAsAssetBadge = self.RECORD_SCHEMA.fields_dict["showAsAssetBadge"].default
+        self.showInColumnsTable = self.RECORD_SCHEMA.fields_dict["showInColumnsTable"].default
+        self.lastModified = self.RECORD_SCHEMA.fields_dict["lastModified"].default
+
+
+    @property
+    def isHidden(self) -> bool:
+        """Whether or not this asset should be hidden in the main application"""
+        return self._inner_dict.get('isHidden')  # type: ignore
+
+    @isHidden.setter
+    def isHidden(self, value: bool) -> None:
+        self._inner_dict['isHidden'] = value
+
+
+    @property
+    def showInSearchFilters(self) -> bool:
+        """Whether or not this asset should be displayed as a search filter"""
+        return self._inner_dict.get('showInSearchFilters')  # type: ignore
+
+    @showInSearchFilters.setter
+    def showInSearchFilters(self, value: bool) -> None:
+        self._inner_dict['showInSearchFilters'] = value
+
+
+    @property
+    def showInAssetSummary(self) -> bool:
+        """Whether or not this asset should be displayed in the asset sidebar"""
+        return self._inner_dict.get('showInAssetSummary')  # type: ignore
+
+    @showInAssetSummary.setter
+    def showInAssetSummary(self, value: bool) -> None:
+        self._inner_dict['showInAssetSummary'] = value
+
+
+    @property
+    def showAsAssetBadge(self) -> bool:
+        """Whether or not this asset should be displayed as an asset badge on other
+        asset's headers"""
+        return self._inner_dict.get('showAsAssetBadge')  # type: ignore
+
+    @showAsAssetBadge.setter
+    def showAsAssetBadge(self, value: bool) -> None:
+        self._inner_dict['showAsAssetBadge'] = value
+
+
+    @property
+    def showInColumnsTable(self) -> bool:
+        """Whether or not this asset should be displayed as a column in the schema field table
+        in a Dataset's "Columns" tab."""
+        return self._inner_dict.get('showInColumnsTable')  # type: ignore
+
+    @showInColumnsTable.setter
+    def showInColumnsTable(self, value: bool) -> None:
+        self._inner_dict['showInColumnsTable'] = value
+
+
+    @property
+    def lastModified(self) -> Union[None, "AuditStampClass"]:
+        """Last Modified Audit stamp"""
+        return self._inner_dict.get('lastModified')  # type: ignore
+
+    @lastModified.setter
+    def lastModified(self, value: Union[None, "AuditStampClass"]) -> None:
+        self._inner_dict['lastModified'] = value
+
+
 class StructuredPropertyValueAssignmentClass(DictWrapper):
     # No docs available.
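A quick usage sketch for the new aspect. StructuredPropertySettingsClass (new in this release) and MetadataChangeProposalWrapper are real datahub APIs; the structured property URN is a made-up example:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StructuredPropertySettingsClass

    # Unset flags fall back to the Avro schema defaults (False).
    settings = StructuredPropertySettingsClass(
        showAsAssetBadge=True,
        showInSearchFilters=True,
    )

    # Attach the settings aspect to a (hypothetical) structured property entity;
    # the wrapper derives the aspect name from the class's ASPECT_NAME.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:structuredProperty:io.example.deprecationDate",
        aspect=settings,
    )
    print(settings.showAsAssetBadge)  # True

The resulting proposal could then be sent through any datahub emitter, which is how the new structuredPropertySettings aspect registered below would reach GMS.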
@@ -24775,6 +24891,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.structured.StructuredProperties': StructuredPropertiesClass,
     'com.linkedin.pegasus2avro.structured.StructuredPropertyDefinition': StructuredPropertyDefinitionClass,
     'com.linkedin.pegasus2avro.structured.StructuredPropertyKey': StructuredPropertyKeyClass,
+    'com.linkedin.pegasus2avro.structured.StructuredPropertySettings': StructuredPropertySettingsClass,
     'com.linkedin.pegasus2avro.structured.StructuredPropertyValueAssignment': StructuredPropertyValueAssignmentClass,
     'com.linkedin.pegasus2avro.tag.TagProperties': TagPropertiesClass,
     'com.linkedin.pegasus2avro.telemetry.TelemetryClientId': TelemetryClientIdClass,
@@ -25240,6 +25357,7 @@ __SCHEMA_TYPES = {
     'StructuredProperties': StructuredPropertiesClass,
     'StructuredPropertyDefinition': StructuredPropertyDefinitionClass,
     'StructuredPropertyKey': StructuredPropertyKeyClass,
+    'StructuredPropertySettings': StructuredPropertySettingsClass,
     'StructuredPropertyValueAssignment': StructuredPropertyValueAssignmentClass,
     'TagProperties': TagPropertiesClass,
     'TelemetryClientId': TelemetryClientIdClass,
@@ -25336,6 +25454,7 @@ ASPECT_CLASSES: List[Type[_Aspect]] = [
     QuerySubjectsClass,
     StructuredPropertyKeyClass,
     StructuredPropertyDefinitionClass,
+    StructuredPropertySettingsClass,
     StructuredPropertiesClass,
     GlobalSettingsInfoClass,
     DataHubRetentionConfigClass,
@@ -25548,6 +25667,7 @@ class AspectBag(TypedDict, total=False):
     querySubjects: QuerySubjectsClass
     structuredPropertyKey: StructuredPropertyKeyClass
     propertyDefinition: StructuredPropertyDefinitionClass
+    structuredPropertySettings: StructuredPropertySettingsClass
     structuredProperties: StructuredPropertiesClass
     globalSettingsInfo: GlobalSettingsInfoClass
     dataHubRetentionConfig: DataHubRetentionConfigClass
datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py

@@ -12,6 +12,7 @@ from .....schema_classes import PropertyValueClass
 from .....schema_classes import StructuredPropertiesClass
 from .....schema_classes import StructuredPropertyDefinitionClass
 from .....schema_classes import StructuredPropertyKeyClass
+from .....schema_classes import StructuredPropertySettingsClass
 from .....schema_classes import StructuredPropertyValueAssignmentClass
 
 
@@ -20,6 +21,7 @@ PropertyValue = PropertyValueClass
 StructuredProperties = StructuredPropertiesClass
 StructuredPropertyDefinition = StructuredPropertyDefinitionClass
 StructuredPropertyKey = StructuredPropertyKeyClass
+StructuredPropertySettings = StructuredPropertySettingsClass
 StructuredPropertyValueAssignment = StructuredPropertyValueAssignmentClass
 
 # fmt: on