acryl-datahub 0.15.0rc15__py3-none-any.whl → 0.15.0rc17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (52)
  1. {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/METADATA +2485 -2501
  2. {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/RECORD +49 -49
  3. datahub/__init__.py +1 -1
  4. datahub/api/entities/structuredproperties/structuredproperties.py +7 -5
  5. datahub/cli/cli_utils.py +2 -0
  6. datahub/cli/delete_cli.py +66 -20
  7. datahub/configuration/common.py +3 -3
  8. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  9. datahub/ingestion/api/source.py +5 -1
  10. datahub/ingestion/api/source_helpers.py +3 -1
  11. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  12. datahub/ingestion/run/pipeline.py +1 -1
  13. datahub/ingestion/run/pipeline_config.py +6 -0
  14. datahub/ingestion/sink/datahub_rest.py +3 -3
  15. datahub/ingestion/source/abs/source.py +4 -0
  16. datahub/ingestion/source/gc/datahub_gc.py +5 -5
  17. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +1 -1
  18. datahub/ingestion/source/kafka/kafka.py +18 -11
  19. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  20. datahub/ingestion/source/looker/view_upstream.py +65 -30
  21. datahub/ingestion/source/mode.py +0 -23
  22. datahub/ingestion/source/redash.py +13 -63
  23. datahub/ingestion/source/redshift/config.py +1 -0
  24. datahub/ingestion/source/redshift/redshift.py +2 -0
  25. datahub/ingestion/source/snowflake/snowflake_config.py +4 -0
  26. datahub/ingestion/source/snowflake/snowflake_query.py +6 -2
  27. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  28. datahub/ingestion/source/snowflake/snowflake_schema.py +12 -0
  29. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +17 -2
  30. datahub/ingestion/source/snowflake/snowflake_utils.py +45 -5
  31. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  32. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  33. datahub/ingestion/source/tableau/tableau.py +35 -16
  34. datahub/ingestion/source/tableau/tableau_common.py +0 -1
  35. datahub/ingestion/source/unity/source.py +2 -0
  36. datahub/ingestion/source/unity/usage.py +20 -11
  37. datahub/metadata/_schema_classes.py +122 -2
  38. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  39. datahub/metadata/schema.avsc +73 -1
  40. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  41. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  42. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  43. datahub/sql_parsing/schema_resolver.py +23 -0
  44. datahub/sql_parsing/sqlglot_lineage.py +48 -13
  45. datahub/testing/doctest.py +12 -0
  46. datahub/utilities/partition_executor.py +1 -1
  47. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  48. datahub/utilities/sql_parser.py +0 -94
  49. datahub/utilities/sql_parser_base.py +0 -21
  50. {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/WHEEL +0 -0
  51. {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/entry_points.txt +0 -0
  52. {acryl_datahub-0.15.0rc15.dist-info → acryl_datahub-0.15.0rc17.dist-info}/top_level.txt +0 -0

datahub/ingestion/run/pipeline.py

@@ -221,7 +221,7 @@ class Pipeline:
         dry_run: bool = False,
         preview_mode: bool = False,
         preview_workunits: int = 10,
-        report_to: Optional[str] = None,
+        report_to: Optional[str] = "datahub",
         no_progress: bool = False,
     ):
         self.config = config
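
In practice, the new default means an ingestion run now reports its summary to the target DataHub instance unless reporting is explicitly disabled. A minimal sketch, assuming Pipeline.create forwards the constructor arguments shown above; the recipe is a placeholder:

from datahub.ingestion.run.pipeline import Pipeline

# Placeholder recipe; substitute a real source/sink.
recipe = {
    "source": {"type": "file", "config": {"path": "./mces.json"}},
    "sink": {"type": "console"},
}

# As of 0.15.0rc17 the default is report_to="datahub"; pass report_to=None
# to skip the default DataHub reporter (the previous behaviour).
pipeline = Pipeline.create(recipe, report_to=None)
pipeline.run()
pipeline.raise_from_status()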

datahub/ingestion/run/pipeline_config.py

@@ -117,3 +117,9 @@ class PipelineConfig(ConfigModel):
         config = cls.parse_obj(resolved_dict)
         config._raw_dict = raw_dict
         return config
+
+    def get_raw_dict(self) -> Dict:
+        result = self._raw_dict
+        if result is None:
+            result = self.dict()
+        return result
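
A brief sketch of how the new accessor behaves: when the config was built through the classmethod shown above (which stashes the raw recipe dict in _raw_dict), that dict is returned unchanged; otherwise it falls back to pydantic's .dict() serialization. The recipe is a placeholder:

from datahub.ingestion.run.pipeline_config import PipelineConfig

recipe = {
    "source": {"type": "file", "config": {"path": "./mces.json"}},
    "sink": {"type": "console"},
}

# Parsed directly, so no raw dict is attached and get_raw_dict() falls back
# to the pydantic serialization of the resolved config.
config = PipelineConfig.parse_obj(recipe)
assert config.get_raw_dict() == config.dict()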

datahub/ingestion/sink/datahub_rest.py

@@ -65,11 +65,11 @@ class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
 
     # These only apply in async modes.
-    max_threads: int = _DEFAULT_REST_SINK_MAX_THREADS
-    max_pending_requests: int = 2000
+    max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
+    max_pending_requests: pydantic.PositiveInt = 2000
 
     # Only applies in async batch mode.
-    max_per_batch: int = 100
+    max_per_batch: pydantic.PositiveInt = 100
 
 
 @dataclasses.dataclass
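
The switch to pydantic.PositiveInt means zero or negative values for these settings are now rejected when the sink config is parsed rather than surfacing later at runtime. A small sketch of the validation behaviour; the server URL is a placeholder:

import pydantic

from datahub.ingestion.sink.datahub_rest import DatahubRestSinkConfig

DatahubRestSinkConfig(server="http://localhost:8080", max_threads=4)  # accepted

try:
    DatahubRestSinkConfig(server="http://localhost:8080", max_threads=0)
except pydantic.ValidationError as e:
    # PositiveInt requires a value greater than zero.
    print(e)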

datahub/ingestion/source/abs/source.py

@@ -201,6 +201,10 @@ class ABSSource(StatefulIngestionSourceBase):
                 ).infer_schema(file)
             elif extension == ".json":
                 fields = json.JsonInferrer().infer_schema(file)
+            elif extension == ".jsonl":
+                fields = json.JsonInferrer(
+                    max_rows=self.source_config.max_rows, format="jsonl"
+                ).infer_schema(file)
             elif extension == ".avro":
                 fields = avro.AvroInferrer().infer_schema(file)
             else:
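
For reference, a hedged sketch of what the new ".jsonl" branch does with newline-delimited JSON. It assumes the inferrer referenced above lives in datahub.ingestion.source.schema_inference.json and accepts a binary file-like object; treat it as illustrative rather than exact:

import io

from datahub.ingestion.source.schema_inference import json as json_inference

jsonl_payload = b'{"id": 1, "name": "a"}\n{"id": 2, "name": "b"}\n'

# Assumed call pattern, mirroring the diff above: cap the sampled rows and
# select the line-delimited format.
fields = json_inference.JsonInferrer(max_rows=100, format="jsonl").infer_schema(
    io.BytesIO(jsonl_payload)
)
print([f.fieldPath for f in fields])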

datahub/ingestion/source/gc/datahub_gc.py

@@ -153,11 +153,6 @@ class DataHubGcSource(Source):
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
-        if self.dataprocess_cleanup:
-            try:
-                yield from self.dataprocess_cleanup.get_workunits_internal()
-            except Exception as e:
-                self.report.failure("While trying to cleanup data process ", exc=e)
         if self.soft_deleted_entities_cleanup:
             try:
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
@@ -170,6 +165,11 @@ class DataHubGcSource(Source):
                 self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
+        if self.dataprocess_cleanup:
+            try:
+                yield from self.dataprocess_cleanup.get_workunits_internal()
+            except Exception as e:
+                self.report.failure("While trying to cleanup data process ", exc=e)
         yield from []
 
     def truncate_indices(self) -> None:

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -60,7 +60,7 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
         description="Query to filter entities",
     )
     limit_entities_delete: Optional[int] = Field(
-        10000, description="Max number of entities to delete."
+        25000, description="Max number of entities to delete."
     )
 
     runtime_limit_seconds: Optional[int] = Field(

datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )
 
 
 def get_kafka_consumer(
@@ -343,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)
 
-        # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
-        for subject in self.schema_registry_client.get_subjects():
-            try:
-                yield from self._extract_record(
-                    subject, True, topic_detail=None, extra_topic_config=None
-                )
-            except Exception as e:
-                logger.warning(f"Failed to extract subject {subject}", exc_info=True)
-                self.report.report_warning(
-                    "subject", f"Exception while extracting topic {subject}: {e}"
-                )
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )
 
     def _extract_record(
         self,
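
The new flag defaults to False, so schema-registry subjects are no longer emitted as standalone entities unless the source is opted back in. A hedged config sketch; the connection details are placeholders:

from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig

config = KafkaSourceConfig.parse_obj(
    {
        "connection": {
            "bootstrap": "localhost:9092",
            "schema_registry_url": "http://localhost:8081",
        },
        # Restore the pre-0.15.0rc17 behaviour of ingesting registry subjects
        # as separate SCHEMA entities alongside the topics.
        "ingest_schemas_as_entities": True,
    }
)
assert config.ingest_schemas_as_entities is True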

datahub/ingestion/source/looker/lookml_concept_context.py

@@ -88,8 +88,7 @@ class LookerFieldContext:
         for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql):
             matched_field = upstream_field_match.group(1)
             # Remove quotes from field names
-            matched_field = matched_field.replace('"', "").replace("`", "").lower()
-            column_names.append(matched_field)
+            column_names.append(matched_field.replace('"', "").replace("`", "").lower())
 
         return column_names
 
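
As a small illustration of what this method extracts, the regex above pulls column names out of ${TABLE}.-style references in a LookML sql: attribute; the snippet is a made-up example:

import re

sql = 'COALESCE(${TABLE}."Customer_ID", ${TABLE}.fallback_id)'

column_names = [
    m.group(1).replace('"', "").replace("`", "").lower()
    for m in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql)
]
print(column_names)  # ['customer_id', 'fallback_id']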

datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 
@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext
@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(
@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
             if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]
@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()

datahub/ingestion/source/mode.py

@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential
 
 import datahub.emitter.mce_builder as builder
@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
             )
             return None
 
-    @lru_cache(maxsize=None)
-    def _get_source_from_query(self, raw_query: str) -> set:
-        query = self._replace_definitions(raw_query)
-        parser = LineageRunner(query)
-        source_paths = set()
-        try:
-            for table in parser.source_tables:
-                sources = str(table).split(".")
-                source_schema, source_table = sources[-2], sources[-1]
-                if source_schema == "<default>":
-                    source_schema = str(self.config.default_schema)
-
-                source_paths.add(f"{source_schema}.{source_table}")
-        except Exception as e:
-            self.report.report_failure(
-                title="Failed to Extract Lineage From Query",
-                message="Unable to retrieve lineage from Mode query.",
-                context=f"Query: {raw_query}, Error: {str(e)}",
-            )
-
-        return source_paths
-
     def _get_datasource_urn(
         self,
         platform: str,

datahub/ingestion/source/redash.py

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set, Type
+from typing import Dict, Iterable, List, Optional, Set
 
 import dateutil.parser as dp
 from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
     ChartTypeClass,
     DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
     parse_table_names_from_sql: bool = Field(
         default=False, description="See note below."
     )
-    sql_parser: str = Field(
-        default="datahub.utilities.sql_parser.DefaultSQLParser",
-        description="custom SQL parser. See note below for details.",
-    )
 
     env: str = Field(
         default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
         self.api_page_limit = self.config.api_page_limit or math.inf
 
         self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-        self.sql_parser_path = self.config.sql_parser
 
         logger.info(
             f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
         config = RedashConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
-    @classmethod
-    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-        assert "." in sql_parser_path, "sql_parser-path must contain a ."
-        parser_cls = import_path(sql_parser_path)
-
-        if not issubclass(parser_cls, SQLParser):
-            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-        return parser_cls
-
-    @classmethod
-    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-        try:
-            sql_table_names: List[str] = parser_cls(sql).get_tables()
-        except Exception as e:
-            logger.warning(f"Sql parser failed on {sql} with {e}")
-            return []
-
-        # Remove quotes from table names
-        sql_table_names = [t.replace('"', "") for t in sql_table_names]
-        sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-        return sql_table_names
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):
 
         return database_name
 
-    def _construct_datalineage_urn(
-        self, platform: str, database_name: str, sql_table_name: str
-    ) -> str:
-        full_dataset_name = get_full_qualified_name(
-            platform, database_name, sql_table_name
-        )
-        return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
     def _get_datasource_urns(
        self, data_source: Dict, sql_query_data: Dict = {}
     ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
             # Getting table lineage from SQL parsing
             if self.parse_table_names_from_sql and data_source_syntax == "sql":
                 dataset_urns = list()
-                try:
-                    sql_table_names = self._get_sql_table_names(
-                        query, self.sql_parser_path
-                    )
-                except Exception as e:
+                sql_parser_in_tables = create_lineage_sql_parsed_result(
+                    query=query,
+                    platform=platform,
+                    env=self.config.env,
+                    platform_instance=None,
+                    default_db=database_name,
+                )
+                # make sure dataset_urns is not empty list
+                dataset_urns = sql_parser_in_tables.in_tables
+                if sql_parser_in_tables.debug_info.table_error:
                     self.report.queries_problem_parsing.add(str(query_id))
                     self.error(
                         logger,
                         "sql-parsing",
-                        f"exception {e} in parsing query-{query_id}-datasource-{data_source_id}",
+                        f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
                     )
-                    sql_table_names = []
-                for sql_table_name in sql_table_names:
-                    try:
-                        dataset_urns.append(
-                            self._construct_datalineage_urn(
-                                platform, database_name, sql_table_name
-                            )
-                        )
-                    except Exception:
-                        self.report.queries_problem_parsing.add(str(query_id))
-                        self.warn(
-                            logger,
-                            "data-urn-invalid",
-                            f"Problem making URN for {sql_table_name} parsed from query {query_id}",
-                        )
 
-                # make sure dataset_urns is not empty list
                 return dataset_urns if len(dataset_urns) > 0 else None
 
             else:
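
With the pluggable sql_parser option removed, Redash lineage now goes through the shared sqlglot-based parser. A hedged sketch of calling create_lineage_sql_parsed_result directly with the same arguments used above; the query, platform, and database names are placeholders, and without a DataHub graph this performs schema-unaware, table-level parsing only:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="SELECT id, amount FROM sales.orders WHERE amount > 0",
    platform="postgres",
    env="PROD",
    platform_instance=None,
    default_db="analytics",
)

if result.debug_info.table_error:
    print(f"Parsing failed: {result.debug_info.table_error}")
else:
    # Upstream dataset URNs, qualified with the default database.
    print(result.in_tables)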

datahub/ingestion/source/redshift/config.py

@@ -159,6 +159,7 @@ class RedshiftConfig(
         description="Whether to extract column level lineage. This config works with rest-sink only.",
     )
 
+    # TODO - use DatasetPropertiesConfigMixin instead
     patch_custom_properties: bool = Field(
         default=True,
         description="Whether to patch custom properties on existing datasets rather than replace.",

datahub/ingestion/source/redshift/redshift.py

@@ -831,6 +831,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             customProperties=custom_properties,
         )
         if self.config.patch_custom_properties:
+            # TODO: use auto_incremental_properties workunit processor instead
+            # Deprecate use of patch_custom_properties
            patch_builder = create_dataset_props_patch_builder(
                dataset_urn, dataset_properties
            )

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -16,6 +16,9 @@ from datahub.configuration.source_common import (
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.api.incremental_properties_helper import (
+    IncrementalPropertiesConfigMixin,
+)
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
@@ -188,6 +191,7 @@ class SnowflakeV2Config(
     StatefulUsageConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
+    IncrementalPropertiesConfigMixin,
 ):
     include_usage_stats: bool = Field(
         default=True,

datahub/ingestion/source/snowflake/snowflake_query.py

@@ -129,7 +129,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
         and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
@@ -149,7 +151,9 @@ class SnowflakeQuery:
         row_count AS "ROW_COUNT",
         bytes AS "BYTES",
         clustering_key AS "CLUSTERING_KEY",
-        auto_clustering_on AS "AUTO_CLUSTERING_ON"
+        auto_clustering_on AS "AUTO_CLUSTERING_ON",
+        is_dynamic AS "IS_DYNAMIC",
+        is_iceberg AS "IS_ICEBERG"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
         and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')

datahub/ingestion/source/snowflake/snowflake_report.py

@@ -113,6 +113,7 @@ class SnowflakeV2Report(
     external_lineage_queries_secs: float = -1
     num_tables_with_known_upstreams: int = 0
     num_upstream_lineage_edge_parsing_failed: int = 0
+    num_secure_views_missing_definition: int = 0
 
     data_dictionary_cache: Optional["SnowflakeDataDictionary"] = None
 

datahub/ingestion/source/snowflake/snowflake_schema.py

@@ -90,6 +90,12 @@ class SnowflakeTable(BaseTable):
     foreign_keys: List[SnowflakeFK] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_dynamic: bool = False
+    is_iceberg: bool = False
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.type is not None and self.type == "HYBRID TABLE"
 
 
 @dataclass
@@ -98,6 +104,7 @@ class SnowflakeView(BaseView):
     columns: List[SnowflakeColumn] = field(default_factory=list)
     tags: Optional[List[SnowflakeTag]] = None
     column_tags: Dict[str, List[SnowflakeTag]] = field(default_factory=dict)
+    is_secure: bool = False
 
 
 @dataclass
@@ -289,6 +296,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                         rows_count=table["ROW_COUNT"],
                         comment=table["COMMENT"],
                         clustering_key=table["CLUSTERING_KEY"],
+                        is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                        is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                     )
                 )
         return tables
@@ -313,6 +322,8 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     rows_count=table["ROW_COUNT"],
                     comment=table["COMMENT"],
                     clustering_key=table["CLUSTERING_KEY"],
+                    is_dynamic=table.get("IS_DYNAMIC", "NO").upper() == "YES",
+                    is_iceberg=table.get("IS_ICEBERG", "NO").upper() == "YES",
                 )
             )
         return tables
@@ -356,6 +367,7 @@ class SnowflakeDataDictionary(SupportsAsObj):
                     materialized=(
                         view.get("is_materialized", "false").lower() == "true"
                     ),
+                    is_secure=(view.get("is_secure", "false").lower() == "true"),
                 )
            )
 

datahub/ingestion/source/snowflake/snowflake_schema_gen.py

@@ -431,6 +431,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                     default_db=db_name,
                     default_schema=schema_name,
                 )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -749,8 +751,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable) and table.clustering_key:
-            custom_properties["CLUSTERING_KEY"] = table.clustering_key
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
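
Taken together, these flags end up as string-valued custom properties on the dataset's datasetProperties aspect. A hedged sketch of the shape of the emitted aspect; the URN and property values are placeholders:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import DatasetPropertiesClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.orders,PROD)",
    aspect=DatasetPropertiesClass(
        name="orders",
        customProperties={
            "CLUSTERING_KEY": "LINEAR(order_date)",
            "IS_DYNAMIC": "true",
            "IS_ICEBERG": "true",
        },
    ),
)
print(mcp.aspectName)  # datasetProperties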