acryl-datahub 0.15.0rc16__py3-none-any.whl → 0.15.0rc18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (34)
  1. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/METADATA +2319 -2319
  2. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/RECORD +34 -32
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +7 -5
  5. datahub/cli/delete_cli.py +66 -20
  6. datahub/configuration/common.py +3 -3
  7. datahub/ingestion/api/source.py +5 -1
  8. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  9. datahub/ingestion/run/pipeline.py +1 -1
  10. datahub/ingestion/run/pipeline_config.py +6 -0
  11. datahub/ingestion/source/kafka/kafka.py +18 -11
  12. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  13. datahub/ingestion/source/looker/view_upstream.py +65 -30
  14. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  15. datahub/ingestion/source/snowflake/snowflake_query.py +6 -2
  16. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  17. datahub/ingestion/source/snowflake/snowflake_schema.py +12 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +17 -2
  19. datahub/ingestion/source/snowflake/snowflake_utils.py +45 -5
  20. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  21. datahub/ingestion/source/tableau/tableau.py +35 -16
  22. datahub/ingestion/source/tableau/tableau_common.py +0 -1
  23. datahub/metadata/_schema_classes.py +122 -2
  24. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  25. datahub/metadata/schema.avsc +73 -1
  26. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  27. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  28. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  29. datahub/sql_parsing/schema_resolver.py +23 -0
  30. datahub/sql_parsing/sqlglot_lineage.py +48 -13
  31. datahub/testing/doctest.py +12 -0
  32. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/WHEEL +0 -0
  33. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.15.0rc16.dist-info → acryl_datahub-0.15.0rc18.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 

@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext

@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """

@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(

@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()

@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]

@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
        if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
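
The net effect of these Looker changes is that column-level lineage no longer trusts the casing used in LookML: the new create_upstream_column_refs helper resolves the upstream dataset's schema from GMS (via create_and_cache_schema_resolver and match_columns_to_schema) and only falls back to lowercasing when no schema is registered. The sketch below illustrates the matching idea only; match_case_insensitive is a hypothetical helper, not the actual match_columns_to_schema implementation.

    from typing import Dict, List

    def match_case_insensitive(schema_columns: Dict[str, str], declared: List[str]) -> List[str]:
        # Map lowercased column names to the exact casing registered in GMS.
        by_lower = {name.lower(): name for name in schema_columns}
        # Keep the registered casing when a match exists; otherwise fall back to
        # lowercase, mirroring the fallback branch in create_upstream_column_refs.
        return [by_lower.get(col.lower(), col.lower()) for col in declared]

    # A LookML view declares "customer_id", but Snowflake registered it as "CUSTOMER_ID".
    print(match_case_insensitive({"CUSTOMER_ID": "NUMBER"}, ["customer_id"]))  # ['CUSTOMER_ID']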
datahub/ingestion/source/metadata/business_glossary.py

@@ -45,6 +45,9 @@ class Owners(ConfigModel):
     groups: Optional[List[str]] = None
 
 
+OwnersMultipleTypes = Union[List[Owners], Owners]
+
+
 class KnowledgeCard(ConfigModel):
     url: Optional[str] = None
     label: Optional[str] = None

@@ -57,7 +60,7 @@ class GlossaryTermConfig(ConfigModel):
     term_source: Optional[str] = None
     source_ref: Optional[str] = None
     source_url: Optional[str] = None
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     inherits: Optional[List[str]] = None
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None

@@ -74,7 +77,7 @@ class GlossaryNodeConfig(ConfigModel):
     id: Optional[str] = None
     name: str
     description: str
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None

@@ -88,7 +91,7 @@ class DefaultConfig(ConfigModel):
     """Holds defaults for populating fields in glossary terms"""
 
     source: Optional[str] = None
-    owners: Owners
+    owners: OwnersMultipleTypes
     url: Optional[str] = None
     source_type: str = "INTERNAL"
 

@@ -153,30 +156,44 @@ def make_glossary_term_urn(
     return "urn:li:glossaryTerm:" + create_id(path, default_id, enable_auto_id)
 
 
-def get_owners(owners: Owners) -> models.OwnershipClass:
-    ownership_type, ownership_type_urn = validate_ownership_type(owners.type)
+def get_owners_multiple_types(owners: OwnersMultipleTypes) -> models.OwnershipClass:
+    """Allows owner types to be a list and maintains backward compatibility"""
+    if isinstance(owners, Owners):
+        return models.OwnershipClass(owners=list(get_owners(owners)))
+
+    owners_meta: List[models.OwnerClass] = []
+    for owner in owners:
+        owners_meta.extend(get_owners(owner))
+
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_owners(owners: Owners) -> Iterable[models.OwnerClass]:
+    actual_type = owners.type or models.OwnershipTypeClass.DEVELOPER
+
+    if actual_type.startswith("urn:li:ownershipType:"):
+        ownership_type: str = "CUSTOM"
+        ownership_type_urn: Optional[str] = actual_type
+    else:
+        ownership_type, ownership_type_urn = validate_ownership_type(actual_type)
+
     if owners.typeUrn is not None:
         ownership_type_urn = owners.typeUrn
-    owners_meta: List[models.OwnerClass] = []
+
     if owners.users is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.users:
+            yield models.OwnerClass(
                 owner=make_user_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.users
-        ]
     if owners.groups is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.groups:
+            yield models.OwnerClass(
                 owner=make_group_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.groups
-        ]
-    return models.OwnershipClass(owners=owners_meta)
 
 
 def get_mces(

@@ -185,7 +202,7 @@ def get_mces(
     ingestion_config: BusinessGlossarySourceConfig,
     ctx: PipelineContext,
 ) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
-    root_owners = get_owners(glossary.owners)
+    root_owners = get_owners_multiple_types(glossary.owners)
 
     if glossary.nodes:
         for node in glossary.nodes:

@@ -270,7 +287,7 @@ def get_mces_from_node(
     node_owners = parentOwners
     if glossaryNode.owners is not None:
         assert glossaryNode.owners is not None
-        node_owners = get_owners(glossaryNode.owners)
+        node_owners = get_owners_multiple_types(glossaryNode.owners)
 
     node_snapshot = models.GlossaryNodeSnapshotClass(
         urn=node_urn,

@@ -426,7 +443,7 @@ def get_mces_from_term(
     ownership: models.OwnershipClass = parentOwnership
     if glossaryTerm.owners is not None:
         assert glossaryTerm.owners is not None
-        ownership = get_owners(glossaryTerm.owners)
+        ownership = get_owners_multiple_types(glossaryTerm.owners)
     aspects.append(ownership)
 
     if glossaryTerm.domain is not None:
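
With OwnersMultipleTypes, a glossary node or term can now carry several owner blocks, each with its own ownership type, while a single Owners block keeps working. A rough sketch of how the new helper folds both shapes into one OwnershipClass; the user, group, and custom ownership-type URN below are illustrative values, not defaults from the library.

    from datahub.ingestion.source.metadata.business_glossary import (
        Owners,
        get_owners_multiple_types,
    )

    # A single Owners block still works (backward compatible) ...
    single = Owners(users=["alice"], type="TECHNICAL_OWNER")

    # ... and a list lets one term mix ownership types, including custom ownership-type URNs.
    mixed = [
        Owners(users=["alice"], type="TECHNICAL_OWNER"),
        Owners(groups=["data-platform"], type="urn:li:ownershipType:steward"),
    ]

    ownership = get_owners_multiple_types(mixed)
    print(len(ownership.owners))  # 2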
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,7 +129,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
         and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')

@@ -149,7 +151,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 
datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass

@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass

@@ -289,6 +296,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    rows_count=table["ROW_COUNT"],
                    comment=table["COMMENT"],
                    clustering_key=table["CLUSTERING_KEY"],
+                   is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                   is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -313,6 +322,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    rows_count=table["ROW_COUNT"],
                    comment=table["COMMENT"],
                    clustering_key=table["CLUSTERING_KEY"],
+                   is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                   is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables

@@ -356,6 +367,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                    materialized=(
                        view.get("is_materialized", "false").lower() == "true"
                    ),
+                   is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
             )
 
datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -431,6 +431,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                    default_db=db_name,
                    default_schema=schema_name,
                )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:

@@ -749,8 +751,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable) and table.clustering_key:
-            custom_properties["CLUSTERING_KEY"] = table.clustering_key
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
datahub/ingestion/source/snowflake/snowflake_utils.py

@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility

@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name.split(".")
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(

@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
            )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0].strip('"'),
-        schema_name=name_parts[1].strip('"'),
-        table_name=name_parts[2].strip('"'),
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
 
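The practical effect of _split_qualified_name shows up when a quoted identifier from the Snowflake audit log itself contains dots: a plain split(".") over-splits the name so it can no longer be mapped to database/schema/table, while the quote-aware split keeps it intact. Illustrative only; the helper is private to snowflake_utils in this release.

    from datahub.ingestion.source.snowflake.snowflake_utils import _split_qualified_name

    name = '"test-database"."test-schema"."TABLE.WITH.DOTS"'
    print(name.split("."))              # ['"test-database"', '"test-schema"', '"TABLE', 'WITH', 'DOTS"']
    print(_split_qualified_name(name))  # ['test-database', 'test-schema', 'TABLE.WITH.DOTS']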
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()  # type: ignore
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()
datahub/ingestion/source/tableau/tableau.py

@@ -353,7 +353,7 @@ class TableauConfig(
 
     project_path_separator: str = Field(
         default="/",
-        description="The separator used for the project_pattern field between project names. By default, we use a slash. "
+        description="The separator used for the project_path_pattern field between project names. By default, we use a slash. "
         "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.",
     )
 

@@ -959,19 +959,36 @@ class TableauSiteSource:
         return is_allowed
 
     def _is_denied_project(self, project: TableauProject) -> bool:
-        # Either project name or project path should exist in deny
-        for deny_pattern in self.config.project_pattern.deny:
-            # Either name or project path is denied
-            if re.match(
-                deny_pattern, project.name, self.config.project_pattern.regex_flags
-            ) or re.match(
-                deny_pattern,
-                self._get_project_path(project),
-                self.config.project_pattern.regex_flags,
-            ):
-                return True
-        logger.info(f"project({project.name}) is not denied as per project_pattern")
-        return False
+        """
+        Why use an explicit denial check instead of the `AllowDenyPattern.allowed` method?
+
+        Consider a scenario where a Tableau site contains four projects: A, B, C, and D, with the following hierarchical relationship:
+
+        - **A**
+        - **B** (Child of A)
+        - **C** (Child of A)
+        - **D**
+
+        In this setup:
+
+        - `project_pattern` is configured with `allow: ["A"]` and `deny: ["B"]`.
+        - `extract_project_hierarchy` is set to `True`.
+
+        The goal is to extract assets from project A and its children while explicitly denying the child project B.
+
+        If we rely solely on the `project_pattern.allowed()` method, project C's assets will not be ingested.
+        This happens because project C is not explicitly included in the `allow` list, nor is it part of the `deny` list.
+        However, since `extract_project_hierarchy` is enabled, project C should ideally be included in the ingestion process unless explicitly denied.
+
+        To address this, the function explicitly checks the deny regex to ensure that project C's assets are ingested if it is not specifically denied in the deny list. This approach ensures that the hierarchy is respected while adhering to the configured allow/deny rules.
+        """
+
+        # Either project_pattern or project_path_pattern is set in a recipe
+        # TableauConfig.projects_backward_compatibility ensures that at least one of these properties is configured.
+
+        return self.config.project_pattern.denied(
+            project.name
+        ) or self.config.project_path_pattern.denied(self._get_project_path(project))
 
     def _init_tableau_project_registry(self, all_project_map: dict) -> None:
         list_of_skip_projects: List[TableauProject] = []

@@ -999,9 +1016,11 @@ class TableauSiteSource:
         for project in list_of_skip_projects:
             if (
                 project.parent_id in projects_to_ingest
-                and self._is_denied_project(project) is False
+                and not self._is_denied_project(project)
             ):
-                logger.debug(f"Project {project.name} is added in project registry")
+                logger.debug(
+                    f"Project {project.name} is added in project registry as it's a child project and not explicitly denied in `deny` list"
+                )
                 projects_to_ingest[project.id] = project
 
         # We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
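
The docstring above is the heart of the Tableau change: when extract_project_hierarchy is enabled, a child project should be dropped only if a deny pattern matches it, not merely because it is absent from the allow list. A self-contained sketch of that distinction, using plain re rather than DataHub's AllowDenyPattern and the A/B/C/D example from the docstring:

    import re

    # Simplified stand-in for the allow/deny semantics described above.
    allow_patterns = ["A"]
    deny_patterns = ["B"]

    def is_denied(name: str) -> bool:
        return any(re.match(p, name) for p in deny_patterns)

    def is_allowed(name: str) -> bool:
        return any(re.match(p, name) for p in allow_patterns) and not is_denied(name)

    # C is a child of the allowed project A but matches neither list.
    print(is_allowed("C"))      # False -> an allowed()-style check would skip C
    print(not is_denied("C"))   # True  -> the explicit deny check keeps C in the hierarchy
    print(not is_denied("B"))   # False -> B stays excluded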
datahub/ingestion/source/tableau/tableau_common.py

@@ -979,7 +979,6 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
         len(query_filter.keys()) == 1
         and query_filter.get(c.ID_WITH_IN)
         and isinstance(query_filter[c.ID_WITH_IN], list)
-        and len(query_filter[c.ID_WITH_IN]) > 100 * page_size
     ):
         ids = query_filter[c.ID_WITH_IN]
         filter_pages = [