acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (133)
  1. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
  2. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
  3. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/common/subtypes.py +2 -0
  35. datahub/ingestion/source/csv_enricher.py +1 -1
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  37. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  38. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  39. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  40. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  41. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  42. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  43. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  44. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  45. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  46. datahub/ingestion/source/elastic_search.py +1 -1
  47. datahub/ingestion/source/feast.py +97 -6
  48. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  49. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  50. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  51. datahub/ingestion/source/ge_data_profiler.py +23 -1
  52. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  53. datahub/ingestion/source/kafka/kafka.py +39 -19
  54. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  55. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  56. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  57. datahub/ingestion/source/looker/view_upstream.py +65 -30
  58. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  59. datahub/ingestion/source/mode.py +0 -23
  60. datahub/ingestion/source/neo4j/__init__.py +0 -0
  61. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  62. datahub/ingestion/source/powerbi/__init__.py +0 -1
  63. datahub/ingestion/source/powerbi/config.py +3 -3
  64. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  65. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  66. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  67. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  68. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  69. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  70. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  71. datahub/ingestion/source/preset.py +1 -0
  72. datahub/ingestion/source/pulsar.py +21 -2
  73. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  74. datahub/ingestion/source/redash.py +13 -63
  75. datahub/ingestion/source/redshift/config.py +1 -0
  76. datahub/ingestion/source/redshift/redshift.py +3 -0
  77. datahub/ingestion/source/s3/source.py +2 -3
  78. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  79. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  80. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  81. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  82. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  83. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  84. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  85. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  86. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  87. datahub/ingestion/source/sql/athena.py +46 -22
  88. datahub/ingestion/source/sql/mssql/source.py +0 -2
  89. datahub/ingestion/source/sql/sql_common.py +34 -21
  90. datahub/ingestion/source/sql/sql_report.py +1 -0
  91. datahub/ingestion/source/sql/sql_types.py +85 -8
  92. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  93. datahub/ingestion/source/superset.py +215 -65
  94. datahub/ingestion/source/tableau/tableau.py +237 -76
  95. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  96. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  97. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  98. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  99. datahub/ingestion/source/unity/proxy_types.py +1 -0
  100. datahub/ingestion/source/unity/source.py +4 -0
  101. datahub/ingestion/source/unity/usage.py +20 -11
  102. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  103. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  104. datahub/integrations/assertion/common.py +1 -1
  105. datahub/lite/duckdb_lite.py +12 -17
  106. datahub/metadata/_schema_classes.py +512 -392
  107. datahub/metadata/_urns/urn_defs.py +1355 -1355
  108. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  109. datahub/metadata/schema.avsc +17222 -17499
  110. datahub/metadata/schemas/FormInfo.avsc +4 -0
  111. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  112. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  113. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  114. datahub/specific/chart.py +0 -39
  115. datahub/specific/dashboard.py +0 -39
  116. datahub/specific/datajob.py +7 -57
  117. datahub/sql_parsing/schema_resolver.py +23 -0
  118. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  119. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  120. datahub/sql_parsing/sqlglot_utils.py +8 -2
  121. datahub/telemetry/telemetry.py +23 -9
  122. datahub/testing/compare_metadata_json.py +1 -1
  123. datahub/testing/doctest.py +12 -0
  124. datahub/utilities/file_backed_collections.py +35 -2
  125. datahub/utilities/partition_executor.py +1 -1
  126. datahub/utilities/urn_encoder.py +2 -1
  127. datahub/utilities/urns/_urn_base.py +1 -1
  128. datahub/utilities/urns/structured_properties_urn.py +1 -1
  129. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  130. datahub/utilities/sql_parser.py +0 -94
  131. datahub/utilities/sql_parser_base.py +0 -21
  132. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  133. {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
@@ -103,6 +103,7 @@ from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 logger = logging.getLogger(__name__)
 
 # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
+# TODO: Move to the standardized types in sql_types.py
 SNOWFLAKE_FIELD_TYPE_MAPPINGS = {
     "DATE": DateType,
     "BIGINT": NumberType,
@@ -423,6 +424,10 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             view_identifier = self.identifiers.get_dataset_identifier(
                 view.name, schema_name, db_name
             )
+            if view.is_secure and not view.view_definition:
+                view.view_definition = self.fetch_secure_view_definition(
+                    view.name, schema_name, db_name
+                )
             if view.view_definition:
                 self.aggregator.add_view_definition(
                     view_urn=self.identifiers.gen_dataset_urn(view_identifier),
@@ -430,6 +435,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                     default_db=db_name,
                     default_schema=schema_name,
                 )
+            elif view.is_secure:
+                self.report.num_secure_views_missing_definition += 1
 
         if self.config.include_technical_schema:
             for view in views:
@@ -446,6 +453,25 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
                     context=f"{db_name}.{schema_name}",
                 )
 
+    def fetch_secure_view_definition(
+        self, table_name: str, schema_name: str, db_name: str
+    ) -> Optional[str]:
+        try:
+            view_definitions = self.data_dictionary.get_secure_view_definitions()
+            return view_definitions[db_name][schema_name][table_name]
+        except Exception as e:
+            if isinstance(e, SnowflakePermissionError):
+                error_msg = (
+                    "Failed to get secure views definitions. Please check permissions."
+                )
+            else:
+                error_msg = "Failed to get secure views definitions"
+            self.structured_reporter.warning(
+                error_msg,
+                exc=e,
+            )
+            return None
+
     def fetch_views_for_schema(
         self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
     ) -> List[SnowflakeView]:
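Note on the new fetch_secure_view_definition helper: the triple-indexed lookup above suggests that get_secure_view_definitions() returns a mapping keyed by database, then schema, then view name. A minimal illustrative sketch, with made-up names (ANALYTICS, PUBLIC, SECURE_ORDERS_V) standing in for real objects:

    # Hypothetical shape of the db -> schema -> view -> DDL mapping implied by
    # the lookup view_definitions[db_name][schema_name][table_name] above.
    secure_view_definitions = {
        "ANALYTICS": {
            "PUBLIC": {
                "SECURE_ORDERS_V": "CREATE SECURE VIEW SECURE_ORDERS_V AS SELECT ...",
            },
        },
    }
    print(secure_view_definitions["ANALYTICS"]["PUBLIC"]["SECURE_ORDERS_V"])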
@@ -748,8 +774,21 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
     ) -> DatasetProperties:
         custom_properties = {}
 
-        if isinstance(table, SnowflakeTable) and table.clustering_key:
-            custom_properties["CLUSTERING_KEY"] = table.clustering_key
+        if isinstance(table, SnowflakeTable):
+            if table.clustering_key:
+                custom_properties["CLUSTERING_KEY"] = table.clustering_key
+
+            if table.is_hybrid:
+                custom_properties["IS_HYBRID"] = "true"
+
+            if table.is_dynamic:
+                custom_properties["IS_DYNAMIC"] = "true"
+
+            if table.is_iceberg:
+                custom_properties["IS_ICEBERG"] = "true"
+
+        if isinstance(table, SnowflakeView) and table.is_secure:
+            custom_properties["IS_SECURE"] = "true"
 
         return DatasetProperties(
             name=table.name,
@@ -1,6 +1,6 @@
 import abc
 from functools import cached_property
-from typing import ClassVar, Literal, Optional, Tuple
+from typing import ClassVar, List, Literal, Optional, Tuple
 
 from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
@@ -119,7 +119,6 @@ class SnowflakeFilter:
     ) -> bool:
         if not dataset_type or not dataset_name:
             return True
-        dataset_params = dataset_name.split(".")
        if dataset_type.lower() not in (
            SnowflakeObjectDomain.TABLE,
            SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
         if _is_sys_table(dataset_name):
             return False
 
+        dataset_params = _split_qualified_name(dataset_name)
         if len(dataset_params) != 3:
             self.structured_reporter.info(
                 title="Unexpected dataset pattern",
@@ -184,6 +184,46 @@ def _is_sys_table(table_name: str) -> bool:
     return table_name.lower().startswith("sys$")
 
 
+def _split_qualified_name(qualified_name: str) -> List[str]:
+    """
+    Split a qualified name into its constituent parts.
+
+    >>> _split_qualified_name("db.my_schema.my_table")
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('"db"."my_schema"."my_table"')
+    ['db', 'my_schema', 'my_table']
+    >>> _split_qualified_name('TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"')
+    ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS']
+    >>> _split_qualified_name('TEST_DB."SCHEMA.WITH.DOTS".MY_TABLE')
+    ['TEST_DB', 'SCHEMA.WITH.DOTS', 'MY_TABLE']
+    """
+
+    # Fast path - no quotes.
+    if '"' not in qualified_name:
+        return qualified_name.split(".")
+
+    # First pass - split on dots that are not inside quotes.
+    in_quote = False
+    parts: List[List[str]] = [[]]
+    for char in qualified_name:
+        if char == '"':
+            in_quote = not in_quote
+        elif char == "." and not in_quote:
+            parts.append([])
+        else:
+            parts[-1].append(char)
+
+    # Second pass - remove outer pairs of quotes.
+    result = []
+    for part in parts:
+        if len(part) > 2 and part[0] == '"' and part[-1] == '"':
+            part = part[1:-1]
+
+        result.append("".join(part))
+
+    return result
+
+
 # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers,
 # For example "test-database"."test-schema".test_table
 # whereas we generate urns without quotes even for quoted identifiers for backward compatibility
@@ -192,7 +232,7 @@ def _is_sys_table(table_name: str) -> bool:
 def _cleanup_qualified_name(
     qualified_name: str, structured_reporter: SourceReport
 ) -> str:
-    name_parts = qualified_name.split(".")
+    name_parts = _split_qualified_name(qualified_name)
     if len(name_parts) != 3:
         if not _is_sys_table(qualified_name):
             structured_reporter.info(
@@ -203,9 +243,9 @@ def _cleanup_qualified_name(
             )
         return qualified_name.replace('"', "")
     return _combine_identifier_parts(
-        db_name=name_parts[0].strip('"'),
-        schema_name=name_parts[1].strip('"'),
-        table_name=name_parts[2].strip('"'),
+        db_name=name_parts[0],
+        schema_name=name_parts[1],
+        table_name=name_parts[2],
     )
 
 
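The quote-aware splitter matters because a plain str.split(".") breaks quoted identifiers that contain dots, which previously sent such names down the "unexpected dataset pattern" branch with only their quotes stripped. A standalone sketch (not the DataHub implementation) using the example from the doctests above:

    qualified = 'TEST_DB.TEST_SCHEMA."TABLE.WITH.DOTS"'

    # Old behaviour: a naive split yields five parts.
    print(qualified.split("."))
    # ['TEST_DB', 'TEST_SCHEMA', '"TABLE', 'WITH', 'DOTS"']

    # New behaviour, per the _split_qualified_name doctests: dots inside quoted
    # identifiers are preserved and the outer quotes are dropped, giving
    # ['TEST_DB', 'TEST_SCHEMA', 'TABLE.WITH.DOTS'].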
@@ -17,6 +17,9 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage
+from datahub.ingestion.api.incremental_properties_helper import (
+    auto_incremental_properties,
+)
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
@@ -446,6 +449,9 @@ class SnowflakeV2Source(
             functools.partial(
                 auto_incremental_lineage, self.config.incremental_lineage
             ),
+            functools.partial(
+                auto_incremental_properties, self.config.incremental_properties
+            ),
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
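The Snowflake source now wires auto_incremental_properties into its workunit processors, gated by a flag read as self.config.incremental_properties. A hedged recipe sketch, assuming the flag is exposed on the source config under that name (as the attribute access above suggests) and using placeholder connection values:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "snowflake",
                "config": {
                    "account_id": "my_account",  # placeholder
                    "username": "my_user",  # placeholder
                    "password": "my_password",  # placeholder
                    "incremental_properties": True,  # flag added in this release
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()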
@@ -26,6 +26,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
 from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
@@ -35,6 +36,7 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     gen_database_container,
@@ -48,6 +50,15 @@ from datahub.utilities.sqlalchemy_type_converter import (
     get_schema_fields_for_sqlalchemy_column,
 )
 
+try:
+    from typing_extensions import override
+except ImportError:
+    _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any])
+
+    def override(f: _F, /) -> _F:  # noqa: F811
+        return f
+
+
 logger = logging.getLogger(__name__)
 
 assert STRUCT, "required type modules are not available"
@@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource):
     - Profiling when enabled.
     """
 
-    table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+    config: AthenaConfig
+    report: SQLSourceReport
 
     def __init__(self, config, ctx):
         super().__init__(config, ctx, "athena")
         self.cursor: Optional[BaseCursor] = None
 
+        self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {}
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = AthenaConfig.parse_obj(config_dict)
@@ -452,6 +466,7 @@ class AthenaSource(SQLAlchemySource):
         )
 
     # It seems like database/schema filter in the connection string does not work and this to work around that
+    @override
     def get_schema_names(self, inspector: Inspector) -> List[str]:
         athena_config = typing.cast(AthenaConfig, self.config)
         schemas = inspector.get_schema_names()
@@ -459,34 +474,42 @@ class AthenaSource(SQLAlchemySource):
             return [schema for schema in schemas if schema == athena_config.database]
         return schemas
 
-    # Overwrite to get partitions
+    @classmethod
+    def _casted_partition_key(cls, key: str) -> str:
+        # We need to cast the partition keys to a VARCHAR, since otherwise
+        # Athena may throw an error during concatenation / comparison.
+        return f"CAST({key} as VARCHAR)"
+
+    @override
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
-    ) -> List[str]:
-        partitions = []
-
-        athena_config = typing.cast(AthenaConfig, self.config)
-
-        if not athena_config.extract_partitions:
-            return []
+    ) -> Optional[List[str]]:
+        if not self.config.extract_partitions:
+            return None
 
         if not self.cursor:
-            return []
+            return None
 
         metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
             table_name=table, schema_name=schema
         )
 
-        if metadata.partition_keys:
-            for key in metadata.partition_keys:
-                if key.name:
-                    partitions.append(key.name)
-
-            if not partitions:
-                return []
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        if not partitions:
+            return []
 
-            # We create an artiificaial concatenated partition key to be able to query max partition easier
-            part_concat = "|| '-' ||".join(partitions)
+        with self.report.report_exc(
+            message="Failed to extract partition details",
+            context=f"{schema}.{table}",
+            level=StructuredLogLevel.WARN,
+        ):
+            # We create an artifical concatenated partition key to be able to query max partition easier
+            part_concat = " || '-' || ".join(
+                self._casted_partition_key(key) for key in partitions
+            )
             max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
             ret = self.cursor.execute(max_partition_query)
             max_partition: Dict[str, str] = {}
@@ -500,9 +523,8 @@ class AthenaSource(SQLAlchemySource):
                 partitions=partitions,
                 max_partition=max_partition,
             )
-            return partitions
 
-        return []
+        return partitions
 
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
@@ -551,7 +573,9 @@ class AthenaSource(SQLAlchemySource):
         if partition and partition.max_partition:
             max_partition_filters = []
             for key, value in partition.max_partition.items():
-                max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'")
+                max_partition_filters.append(
+                    f"{self._casted_partition_key(key)} = '{value}'"
+                )
             max_partition = str(partition.max_partition)
             return (
                 max_partition,
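Taken together, the Athena hunks above build the max-partition query from VARCHAR-cast partition keys, since (per the code comment) Athena may otherwise error on the concatenation or comparison. A standalone sketch of the strings being produced, using a made-up schema "db", table "sales", and partition keys "year"/"month":

    def casted_partition_key(key: str) -> str:
        # Mirrors the string built by AthenaSource._casted_partition_key above.
        return f"CAST({key} as VARCHAR)"

    partitions = ["year", "month"]
    part_concat = " || '-' || ".join(casted_partition_key(k) for k in partitions)
    print(part_concat)
    # CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)

    schema, table = "db", "sales"
    print(
        f'select {",".join(partitions)} from "{schema}"."{table}$partitions" '
        f'where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
    )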
@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.warn(logger, dataset_name, "unable to get schema for this view")
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-        if "view_definition" in properties:
-            view_definition_string = properties["view_definition"]
-            view_properties_aspect = ViewPropertiesClass(
-                materialized=False, viewLanguage="SQL", viewLogic=view_definition_string
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=view_properties_aspect,
-            ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, ValuesView
+from typing import Any, Dict, Optional, Type, Union, ValuesView
 
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayType,
@@ -16,14 +16,28 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     UnionType,
 )
 
-# these can be obtained by running `select format_type(oid, null),* from pg_type;`
-# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
-# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+DATAHUB_FIELD_TYPE = Union[
+    ArrayType,
+    BooleanType,
+    BytesType,
+    DateType,
+    EnumType,
+    MapType,
+    NullType,
+    NumberType,
+    RecordType,
+    StringType,
+    TimeType,
+    UnionType,
+]
 
-# we map from format_type since this is what dbt uses
-# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
 
-# see https://www.npgsql.org/dev/types.html for helpful type annotations
+# These can be obtained by running `select format_type(oid, null),* from pg_type;`
+# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
+# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV)
+# We map from format_type since this is what dbt uses.
+# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22
+# See https://www.npgsql.org/dev/types.html for helpful type annotations
 POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "boolean": BooleanType,
     "bytea": BytesType,
@@ -262,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
         return VERTICA_SQL_TYPES_MAP[type_string]
 
 
-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -298,6 +311,18 @@ SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "GEOGRAPHY": None,
 }
 
+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    # Match types with precision and scale, e.g., 'DECIMAL(38,0)'
+    match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
+    if match:
+        modified_type_base = match.group(1)  # Extract the base type
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base, None)
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string, None)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -366,6 +391,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }
 
 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -430,3 +456,54 @@ VERTICA_SQL_TYPES_MAP: Dict[str, Any] = {
     "geography": None,
     "uuid": StringType,
 }
+
+
+_merged_mapping = {
+    "boolean": BooleanType,
+    "date": DateType,
+    "time": TimeType,
+    "numeric": NumberType,
+    "text": StringType,
+    "timestamp with time zone": DateType,
+    "timestamp without time zone": DateType,
+    "integer": NumberType,
+    "float8": NumberType,
+    "struct": RecordType,
+    **POSTGRES_TYPES_MAP,
+    **SNOWFLAKE_TYPES_MAP,
+    **BIGQUERY_TYPES_MAP,
+    **SPARK_SQL_TYPES_MAP,
+    **TRINO_SQL_TYPES_MAP,
+    **ATHENA_SQL_TYPES_MAP,
+    **VERTICA_SQL_TYPES_MAP,
+}
+
+
+def resolve_sql_type(
+    column_type: Optional[str],
+    platform: Optional[str] = None,
+) -> Optional[DATAHUB_FIELD_TYPE]:
+    # In theory, we should use the platform-specific mapping where available.
+    # However, the types don't ever conflict, so the merged mapping is fine.
+    TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = (
+        _merged_mapping.get(column_type) if column_type else None
+    )
+
+    if TypeClass is None and column_type:
+        # resolve a modified type
+        if platform == "trino":
+            TypeClass = resolve_trino_modified_type(column_type)
+        elif platform == "athena":
+            TypeClass = resolve_athena_modified_type(column_type)
+        elif platform == "postgres" or platform == "redshift":
+            # Redshift uses a variant of Postgres, so we can use the same logic.
+            TypeClass = resolve_postgres_modified_type(column_type)
+        elif platform == "vertica":
+            TypeClass = resolve_vertica_modified_type(column_type)
+        elif platform == "snowflake":
+            # Snowflake types are uppercase, so we check that.
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
+
+    if TypeClass:
+        return TypeClass()
+    return None
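A short usage sketch for the new resolve_sql_type helper, assuming the 0.15.0 module layout shown in this diff. The type strings are examples; note the Snowflake value keeps a space after the comma so it matches the precision/scale regex above:

    from datahub.ingestion.source.sql.sql_types import resolve_sql_type

    # Direct hit in the merged mapping -> a BooleanType instance.
    print(resolve_sql_type("boolean"))

    # Not in the merged mapping, so it falls through to the Snowflake resolver
    # for modified types -> a NumberType instance.
    print(resolve_sql_type("DECIMAL(38, 0)", platform="snowflake"))

    # Unrecognized types resolve to None.
    print(resolve_sql_type("no_such_type"))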
@@ -69,7 +69,7 @@ class RedundantRunSkipHandler(
         platform: Optional[str] = None
         source_class = type(self.source)
         if hasattr(source_class, "get_platform_name"):
-            platform = source_class.get_platform_name()  # type: ignore
+            platform = source_class.get_platform_name()
 
         # Default name for everything else
         job_name_suffix = self.get_job_name_suffix()