acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/kafka/kafka.py

@@ -141,6 +141,10 @@ class KafkaSourceConfig(
         default=False,
         description="Disables the utilization of the TopicRecordNameStrategy for Schema Registry subjects. For more information, visit: https://docs.confluent.io/platform/current/schema-registry/serdes-develop/index.html#handling-differences-between-preregistered-and-client-derived-schemas:~:text=io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
     )
+    ingest_schemas_as_entities: bool = pydantic.Field(
+        default=False,
+        description="Enables ingesting schemas from schema registry as separate entities, in addition to the topics",
+    )
 
 
 def get_kafka_consumer(
@@ -148,7 +152,7 @@ def get_kafka_consumer(
 ) -> confluent_kafka.Consumer:
     consumer = confluent_kafka.Consumer(
         {
-            "group.id": "test",
+            "group.id": "datahub-kafka-ingestion",
             "bootstrap.servers": connection.bootstrap,
             **connection.consumer_config,
         }
@@ -164,6 +168,25 @@ def get_kafka_consumer(
     return consumer
 
 
+def get_kafka_admin_client(
+    connection: KafkaConsumerConnectionConfig,
+) -> AdminClient:
+    client = AdminClient(
+        {
+            "group.id": "datahub-kafka-ingestion",
+            "bootstrap.servers": connection.bootstrap,
+            **connection.consumer_config,
+        }
+    )
+    if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+        # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+        # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+        logger.debug("Initiating polling for kafka admin client")
+        client.poll(timeout=30)
+        logger.debug("Initiated polling for kafka admin client")
+    return client
+
+
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
@@ -278,13 +301,7 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
     def init_kafka_admin_client(self) -> None:
         try:
             # TODO: Do we require separate config than existing consumer_config ?
-            self.admin_client = AdminClient(
-                {
-                    "group.id": "test",
-                    "bootstrap.servers": self.source_config.connection.bootstrap,
-                    **self.source_config.connection.consumer_config,
-                }
-            )
+            self.admin_client = get_kafka_admin_client(self.source_config.connection)
         except Exception as e:
             logger.debug(e, exc_info=e)
             self.report.report_warning(
@@ -330,17 +347,20 @@ class KafkaSource(StatefulIngestionSourceBase, TestableSource):
             else:
                 self.report.report_dropped(topic)
 
-        # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
-        for subject in self.schema_registry_client.get_subjects():
-            try:
-                yield from self._extract_record(
-                    subject, True, topic_detail=None, extra_topic_config=None
-                )
-            except Exception as e:
-                logger.warning(f"Failed to extract subject {subject}", exc_info=True)
-                self.report.report_warning(
-                    "subject", f"Exception while extracting topic {subject}: {e}"
-                )
+        if self.source_config.ingest_schemas_as_entities:
+            # Get all subjects from schema registry and ingest them as SCHEMA DatasetSubTypes
+            for subject in self.schema_registry_client.get_subjects():
+                try:
+                    yield from self._extract_record(
+                        subject, True, topic_detail=None, extra_topic_config=None
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to extract subject {subject}", exc_info=True
+                    )
+                    self.report.report_warning(
+                        "subject", f"Exception while extracting topic {subject}: {e}"
+                    )
 
     def _extract_record(
         self,
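
For illustration (not part of the diff): a minimal, hypothetical recipe sketch showing how the new ingest_schemas_as_entities option could be enabled on the kafka source. It uses the standard Pipeline.create entry point; the bootstrap server and sink settings are placeholders.

    # Hypothetical usage sketch for the new KafkaSourceConfig.ingest_schemas_as_entities flag.
    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "kafka",
                "config": {
                    "connection": {"bootstrap": "localhost:9092"},
                    # New in 0.15.0: also emit schema registry subjects as entities.
                    "ingest_schemas_as_entities": True,
                },
            },
            # Placeholder sink; point this at your own DataHub instance.
            "sink": {
                "type": "datahub-rest",
                "config": {"server": "http://localhost:8080"},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()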

datahub/ingestion/source/kafka/kafka_connect.py

@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
         query: str
         transforms: list
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                 source_table = f"{table_name_tuple[-2]}.{source_table}"
             else:
                 include_source_dataset = False
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find schema for table {source_table}",
+                self.report.warning(
+                    "Could not find schema for table"
+                    f"{self.connector_manifest.name} : {source_table}",
                 )
             dataset_name: str = get_dataset_name(database_name, source_table)
             lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                     target_platform=KAFKA,
                 )
                 lineages.append(lineage)
-            self.report_warning(
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                         include_source_dataset=False,
                     )
                 )
-            self.report_warning(
-                self.connector_manifest.name,
-                f"could not find input dataset, for connector topics {topic_names}",
+            self.report.warning(
+                "Could not find input dataset for connector topics",
+                f"{self.connector_manifest.name} : {topic_names}",
             )
             self.connector_manifest.lineages = lineages
             return
         else:
             include_source_dataset = True
         if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-            self.report_warning(
-                self.connector_manifest.name,
-                f"could not find input dataset, connector has unknown transform - {transforms[0]['type']}",
+            self.report.warning(
+                "Could not find input dataset, connector has unknown transform",
+                f"{self.connector_manifest.name} : {transforms[0]['type']}",
             )
             include_source_dataset = False
         if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-            self.report_warning(
+            self.report.warning(
+                "Could not find input dataset, connector has one or more unknown transforms",
                 self.connector_manifest.name,
-                "could not find input dataset, connector has one or more unknown transforms",
             )
             include_source_dataset = False
         lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                 lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
         defaultDataset: Optional[str] = None
         version: str = "v1"
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
             transformed_topic = self.apply_transformations(topic, transforms)
             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
             if dataset_table is None:
-                self.report_warning(
-                    self.connector_manifest.name,
-                    f"could not find target dataset for topic {transformed_topic}, please check your connector configuration",
+                self.report.warning(
+                    "Could not find target dataset for topic, please check your connector configuration"
+                    f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
             target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
         schema_name: str
         topics_to_tables: Dict[str, str]
 
-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
                 )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.report_warning(
-                self.connector_manifest.name, f"Error resolving lineage: {e}"
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )
 
         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             self.session.auth = (self.config.username, self.config.password)
 
-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         payload = connector_response.json()
 
-        for c in payload:
-            connector_url = f"{self.config.connect_uri}/connectors/{c}"
-            connector_response = self.session.get(connector_url)
-            manifest = connector_response.json()
-            connector_manifest = ConnectorManifest(**manifest)
-            if not self.config.connector_patterns.allowed(connector_manifest.name):
-                self.report.report_dropped(connector_manifest.name)
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue
 
             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
            connector_manifest.url = connector_url
 
-            topics = self.session.get(
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)
 
             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.session.get(
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)
 
                 # JDBC source connector lineages
                 if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                     )
                     continue
 
-                for topic in topics:
+                for topic in connector_manifest.topic_names:
                     lineage = KafkaConnectLineage(
                         source_dataset=target_connector.source_dataset,
                         source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
 
         return connectors_manifest
 
+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
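
For illustration (not part of the diff): the kafka-connect hunks above replace the per-connector report_warning(key, reason) helpers with the structured SourceReport.warning(message, context, exc=...) call. A hypothetical, self-contained sketch of that convention (the connector name and the simulated failure are placeholders):

    # Hypothetical sketch of the structured warning convention used in the hunks above.
    from datahub.ingestion.api.source import SourceReport

    report = SourceReport()
    try:
        raise ConnectionError("Connect REST endpoint unreachable")  # simulated failure
    except Exception as err:
        # Message first, then a context string identifying the offending object,
        # plus the exception so the report keeps the stack trace.
        report.warning(
            "Failed to get connector details",
            context="my-jdbc-connector",
            exc=err,
        )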

datahub/ingestion/source/looker/looker_liquid_tag.py

@@ -4,6 +4,7 @@ from typing import ClassVar, Optional, TextIO
 from liquid import Environment
 from liquid.ast import Node
 from liquid.context import Context
+from liquid.filter import string_filter
 from liquid.parse import expect, get_parser
 from liquid.stream import TokenStream
 from liquid.tag import Tag
@@ -81,12 +82,18 @@ class ConditionTag(Tag):
 custom_tags = [ConditionTag]
 
 
+@string_filter
+def sql_quote_filter(variable: str) -> str:
+    return f"'{variable}'"
+
+
 @lru_cache(maxsize=1)
 def _create_env() -> Environment:
-    env: Environment = Environment()
+    env: Environment = Environment(strict_filters=False)
     # register tag. One time activity
     for custom_tag in custom_tags:
         env.add_tag(custom_tag)
+    env.add_filter("sql_quote", sql_quote_filter)
     return env
 
 
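
For illustration (not part of the diff): the new sql_quote filter simply wraps a rendered value in single quotes. A hypothetical standalone snippet with python-liquid shows the effect (the template string and value are made up):

    # Hypothetical standalone demo of a sql_quote-style liquid filter,
    # mirroring the filter registered in _create_env() above.
    from liquid import Environment
    from liquid.filter import string_filter


    @string_filter
    def sql_quote_filter(variable: str) -> str:
        return f"'{variable}'"


    env = Environment(strict_filters=False)
    env.add_filter("sql_quote", sql_quote_filter)

    template = env.from_string(
        "SELECT * FROM orders WHERE order_date >= {{ start_date | sql_quote }}"
    )
    print(template.render(start_date="2024-01-01"))
    # -> SELECT * FROM orders WHERE order_date >= '2024-01-01'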

datahub/ingestion/source/looker/lookml_concept_context.py

@@ -88,8 +88,7 @@ class LookerFieldContext:
         for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql):
             matched_field = upstream_field_match.group(1)
             # Remove quotes from field names
-            matched_field = matched_field.replace('"', "").replace("`", "").lower()
-            column_names.append(matched_field)
+            column_names.append(matched_field.replace('"', "").replace("`", "").lower())
 
         return column_names
 

datahub/ingestion/source/looker/view_upstream.py

@@ -25,11 +25,13 @@ from datahub.ingestion.source.looker.lookml_config import (
     LookMLSourceReport,
 )
 from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name
+from datahub.sql_parsing.schema_resolver import match_columns_to_schema
 from datahub.sql_parsing.sqlglot_lineage import (
     ColumnLineageInfo,
     ColumnRef,
     SqlParsingResult,
     Urn,
+    create_and_cache_schema_resolver,
     create_lineage_sql_parsed_result,
 )
 
@@ -200,7 +202,7 @@ def _generate_fully_qualified_name(
 class AbstractViewUpstream(ABC):
     """
     Implementation of this interface extracts the view upstream as per the way the view is bound to datasets.
-    For detail explanation please refer lookml_concept_context.LookerViewContext documentation.
+    For detail explanation, please refer lookml_concept_context.LookerViewContext documentation.
     """
 
     view_context: LookerViewContext
@@ -236,6 +238,47 @@ class AbstractViewUpstream(ABC):
     def create_fields(self) -> List[ViewField]:
         return []  # it is for the special case
 
+    def create_upstream_column_refs(
+        self, upstream_urn: str, downstream_looker_columns: List[str]
+    ) -> List[ColumnRef]:
+        """
+        - **`upstream_urn`**: The URN of the upstream dataset.
+
+        - **`expected_columns`**: These are the columns identified by the Looker connector as belonging to the `upstream_urn` dataset. However, there is potential for human error in specifying the columns of the upstream dataset. For example, a user might declare a column in lowercase, while on the actual platform, it may exist in uppercase, or vice versa.
+
+        - This function ensures consistency in column-level lineage by consulting GMS before creating the final `ColumnRef` instance, avoiding discrepancies.
+        """
+        schema_resolver = create_and_cache_schema_resolver(
+            platform=self.view_context.view_connection.platform,
+            platform_instance=self.view_context.view_connection.platform_instance,
+            env=self.view_context.view_connection.platform_env or self.config.env,
+            graph=self.ctx.graph,
+        )
+
+        urn, schema_info = schema_resolver.resolve_urn(urn=upstream_urn)
+
+        if schema_info:
+            actual_columns = match_columns_to_schema(
+                schema_info, downstream_looker_columns
+            )
+        else:
+            logger.info(
+                f"schema_info not found for dataset {urn} in GMS. Using expected_columns to form ColumnRef"
+            )
+            actual_columns = [column.lower() for column in downstream_looker_columns]
+
+        upstream_column_refs: List[ColumnRef] = []
+
+        for column in actual_columns:
+            upstream_column_refs.append(
+                ColumnRef(
+                    column=column,
+                    table=upstream_urn,
+                )
+            )
+
+        return upstream_column_refs
+
 
 class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
     """
@@ -372,15 +415,12 @@ class SqlBasedDerivedViewUpstream(AbstractViewUpstream, ABC):
         # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is
         # referring to upstream table
         if self._get_upstream_dataset_urn() and not upstreams_column_refs:
-            upstreams_column_refs = [
-                ColumnRef(
-                    table=self._get_upstream_dataset_urn()[
-                        0
-                    ],  # 0th index has table of from clause
-                    column=column,
-                )
-                for column in field_context.column_name_in_sql_attribute()
-            ]
+            upstreams_column_refs = self.create_upstream_column_refs(
+                upstream_urn=self._get_upstream_dataset_urn()[
+                    0
+                ],  # 0th index has table of from clause,
+                downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+            )
 
         # fix any derived view reference present in urn
         upstreams_column_refs = resolve_derived_view_urn_of_col_ref(
@@ -487,18 +527,18 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
             return upstream_column_refs
 
         explore_urn: str = self._get_upstream_dataset_urn()[0]
+        expected_columns: List[str] = []
 
         for column in field_context.column_name_in_sql_attribute():
            if column in self._get_explore_column_mapping():
                 explore_column: Dict = self._get_explore_column_mapping()[column]
-                upstream_column_refs.append(
-                    ColumnRef(
-                        column=explore_column.get("field", explore_column[NAME]),
-                        table=explore_urn,
-                    )
+                expected_columns.append(
+                    explore_column.get("field", explore_column[NAME])
                 )
 
-        return upstream_column_refs
+        return self.create_upstream_column_refs(
+            upstream_urn=explore_urn, downstream_looker_columns=expected_columns
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
@@ -548,14 +588,10 @@ class RegularViewUpstream(AbstractViewUpstream):
     def get_upstream_column_ref(
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
-        upstream_column_ref: List[ColumnRef] = []
-
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn(),
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return [self._get_upstream_dataset_urn()]
@@ -609,15 +645,14 @@ class DotSqlTableNameViewUpstream(AbstractViewUpstream):
         self, field_context: LookerFieldContext
     ) -> List[ColumnRef]:
         upstream_column_ref: List[ColumnRef] = []
+
         if not self._get_upstream_dataset_urn():
             return upstream_column_ref
 
-        for column_name in field_context.column_name_in_sql_attribute():
-            upstream_column_ref.append(
-                ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name)
-            )
-
-        return upstream_column_ref
+        return self.create_upstream_column_refs(
+            upstream_urn=self._get_upstream_dataset_urn()[0],
+            downstream_looker_columns=field_context.column_name_in_sql_attribute(),
+        )
 
     def get_upstream_dataset_urn(self) -> List[Urn]:
         return self._get_upstream_dataset_urn()
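
For illustration (not part of the diff): the create_upstream_column_refs docstring above describes matching Looker-declared column names against the schema fetched from GMS so that column-level lineage uses the platform's spelling. A toy, hypothetical helper (not the real match_columns_to_schema) showing that case-insensitive matching idea:

    # Toy sketch of case-insensitive column matching; NOT the real
    # match_columns_to_schema implementation from datahub.sql_parsing.schema_resolver.
    from typing import Dict, List


    def match_columns_case_insensitively(
        schema_columns: Dict[str, str], looker_columns: List[str]
    ) -> List[str]:
        by_lower = {name.lower(): name for name in schema_columns}
        # Prefer the spelling from the resolved schema; fall back to lowercase otherwise.
        return [by_lower.get(col.lower(), col.lower()) for col in looker_columns]


    schema = {"Order_ID": "NUMBER", "CUSTOMER_NAME": "VARCHAR"}  # made-up schema
    print(match_columns_case_insensitively(schema, ["order_id", "Customer_Name"]))
    # -> ['Order_ID', 'CUSTOMER_NAME']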

datahub/ingestion/source/metadata/business_glossary.py

@@ -45,6 +45,9 @@ class Owners(ConfigModel):
     groups: Optional[List[str]] = None
 
 
+OwnersMultipleTypes = Union[List[Owners], Owners]
+
+
 class KnowledgeCard(ConfigModel):
     url: Optional[str] = None
     label: Optional[str] = None
@@ -57,7 +60,7 @@ class GlossaryTermConfig(ConfigModel):
     term_source: Optional[str] = None
     source_ref: Optional[str] = None
     source_url: Optional[str] = None
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     inherits: Optional[List[str]] = None
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
@@ -74,7 +77,7 @@ class GlossaryNodeConfig(ConfigModel):
     id: Optional[str] = None
     name: str
     description: str
-    owners: Optional[Owners] = None
+    owners: Optional[OwnersMultipleTypes] = None
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
@@ -88,7 +91,7 @@ class DefaultConfig(ConfigModel):
     """Holds defaults for populating fields in glossary terms"""
 
     source: Optional[str] = None
-    owners: Owners
+    owners: OwnersMultipleTypes
     url: Optional[str] = None
     source_type: str = "INTERNAL"
 
@@ -153,30 +156,44 @@ def make_glossary_term_urn(
     return "urn:li:glossaryTerm:" + create_id(path, default_id, enable_auto_id)
 
 
-def get_owners(owners: Owners) -> models.OwnershipClass:
-    ownership_type, ownership_type_urn = validate_ownership_type(owners.type)
+def get_owners_multiple_types(owners: OwnersMultipleTypes) -> models.OwnershipClass:
+    """Allows owner types to be a list and maintains backward compatibility"""
+    if isinstance(owners, Owners):
+        return models.OwnershipClass(owners=list(get_owners(owners)))
+
+    owners_meta: List[models.OwnerClass] = []
+    for owner in owners:
+        owners_meta.extend(get_owners(owner))
+
+    return models.OwnershipClass(owners=owners_meta)
+
+
+def get_owners(owners: Owners) -> Iterable[models.OwnerClass]:
+    actual_type = owners.type or models.OwnershipTypeClass.DEVELOPER
+
+    if actual_type.startswith("urn:li:ownershipType:"):
+        ownership_type: str = "CUSTOM"
+        ownership_type_urn: Optional[str] = actual_type
+    else:
+        ownership_type, ownership_type_urn = validate_ownership_type(actual_type)
+
     if owners.typeUrn is not None:
         ownership_type_urn = owners.typeUrn
-    owners_meta: List[models.OwnerClass] = []
+
     if owners.users is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.users:
+            yield models.OwnerClass(
                 owner=make_user_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.users
-        ]
     if owners.groups is not None:
-        owners_meta = owners_meta + [
-            models.OwnerClass(
+        for o in owners.groups:
+            yield models.OwnerClass(
                 owner=make_group_urn(o),
                 type=ownership_type,
                 typeUrn=ownership_type_urn,
             )
-            for o in owners.groups
-        ]
-    return models.OwnershipClass(owners=owners_meta)
 
 
 def get_mces(
@@ -185,7 +202,7 @@ def get_mces(
     ingestion_config: BusinessGlossarySourceConfig,
     ctx: PipelineContext,
 ) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
-    root_owners = get_owners(glossary.owners)
+    root_owners = get_owners_multiple_types(glossary.owners)
 
     if glossary.nodes:
         for node in glossary.nodes:
@@ -270,7 +287,7 @@ def get_mces_from_node(
     node_owners = parentOwners
     if glossaryNode.owners is not None:
         assert glossaryNode.owners is not None
-        node_owners = get_owners(glossaryNode.owners)
+        node_owners = get_owners_multiple_types(glossaryNode.owners)
 
     node_snapshot = models.GlossaryNodeSnapshotClass(
         urn=node_urn,
@@ -426,7 +443,7 @@ def get_mces_from_term(
     ownership: models.OwnershipClass = parentOwnership
     if glossaryTerm.owners is not None:
         assert glossaryTerm.owners is not None
-        ownership = get_owners(glossaryTerm.owners)
+        ownership = get_owners_multiple_types(glossaryTerm.owners)
     aspects.append(ownership)
 
     if glossaryTerm.domain is not None:
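
For illustration (not part of the diff): with OwnersMultipleTypes, the owners field of a glossary, node, or term may now be either a single Owners block (old behaviour) or a list of Owners blocks, each carrying its own ownership type. A hypothetical sketch using the new helper directly (the user and group names are placeholders):

    # Hypothetical sketch of the new owners-as-list behaviour.
    from datahub.ingestion.source.metadata.business_glossary import (
        Owners,
        get_owners_multiple_types,
    )

    ownership = get_owners_multiple_types(
        [
            Owners(users=["alice"], type="TECHNICAL_OWNER"),
            Owners(groups=["data-governance"], type="BUSINESS_OWNER"),
        ]
    )
    print([owner.owner for owner in ownership.owners])
    # -> ['urn:li:corpuser:alice', 'urn:li:corpGroup:data-governance']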