acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (120):
  1. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
  2. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
  3. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
  5. datahub/__init__.py +1 -1
  6. datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
  7. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  8. datahub/configuration/common.py +2 -5
  9. datahub/configuration/source_common.py +13 -0
  10. datahub/emitter/mce_builder.py +20 -4
  11. datahub/emitter/mcp_builder.py +2 -7
  12. datahub/emitter/mcp_patch_builder.py +37 -13
  13. datahub/emitter/rest_emitter.py +25 -3
  14. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
  15. datahub/ingestion/api/closeable.py +3 -3
  16. datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
  17. datahub/ingestion/api/report.py +4 -1
  18. datahub/ingestion/api/sink.py +4 -3
  19. datahub/ingestion/api/source.py +4 -0
  20. datahub/ingestion/api/source_helpers.py +2 -6
  21. datahub/ingestion/glossary/classifier.py +2 -3
  22. datahub/ingestion/graph/client.py +6 -3
  23. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
  24. datahub/ingestion/source/aws/aws_common.py +231 -27
  25. datahub/ingestion/source/aws/glue.py +12 -2
  26. datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
  27. datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
  28. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
  29. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
  30. datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
  31. datahub/ingestion/source/datahub/config.py +22 -1
  32. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  33. datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
  34. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  35. datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
  36. datahub/ingestion/source/gc/datahub_gc.py +21 -5
  37. datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
  38. datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
  39. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
  40. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  41. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  42. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  43. datahub/ingestion/source/kafka_connect/common.py +202 -0
  44. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  45. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  46. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  47. datahub/ingestion/source/looker/looker_common.py +63 -2
  48. datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
  49. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  50. datahub/ingestion/source/looker/looker_source.py +31 -4
  51. datahub/ingestion/source/looker/looker_usage.py +23 -17
  52. datahub/ingestion/source/mlflow.py +30 -5
  53. datahub/ingestion/source/mode.py +40 -27
  54. datahub/ingestion/source/powerbi/config.py +1 -14
  55. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  56. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  57. datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
  58. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
  59. datahub/ingestion/source/s3/source.py +1 -1
  60. datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
  61. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
  62. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
  63. datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
  64. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
  65. datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
  66. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
  67. datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
  68. datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
  69. datahub/ingestion/source/sql/hive.py +621 -8
  70. datahub/ingestion/source/sql/hive_metastore.py +7 -0
  71. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  72. datahub/ingestion/source/sql/mssql/source.py +15 -1
  73. datahub/ingestion/source/sql/sql_common.py +41 -102
  74. datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
  75. datahub/ingestion/source/sql/sql_report.py +2 -0
  76. datahub/ingestion/source/state/checkpoint.py +2 -1
  77. datahub/ingestion/source/tableau/tableau.py +122 -45
  78. datahub/ingestion/source/tableau/tableau_common.py +18 -0
  79. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  80. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  81. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  82. datahub/ingestion/source/unity/proxy.py +8 -27
  83. datahub/ingestion/source/usage/usage_common.py +15 -1
  84. datahub/ingestion/source_report/ingestion_stage.py +3 -0
  85. datahub/metadata/_schema_classes.py +256 -3
  86. datahub/metadata/_urns/urn_defs.py +168 -168
  87. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
  88. datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
  89. datahub/metadata/schema.avsc +252 -33
  90. datahub/metadata/schemas/DataJobKey.avsc +2 -1
  91. datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
  92. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  93. datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
  94. datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
  95. datahub/metadata/schemas/MLModelProperties.avsc +62 -2
  96. datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
  97. datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
  98. datahub/specific/aspect_helpers/__init__.py +0 -0
  99. datahub/specific/aspect_helpers/custom_properties.py +79 -0
  100. datahub/specific/aspect_helpers/ownership.py +67 -0
  101. datahub/specific/aspect_helpers/structured_properties.py +72 -0
  102. datahub/specific/aspect_helpers/tags.py +42 -0
  103. datahub/specific/aspect_helpers/terms.py +43 -0
  104. datahub/specific/chart.py +28 -184
  105. datahub/specific/dashboard.py +31 -196
  106. datahub/specific/datajob.py +34 -189
  107. datahub/specific/dataproduct.py +24 -86
  108. datahub/specific/dataset.py +48 -133
  109. datahub/specific/form.py +12 -32
  110. datahub/specific/structured_property.py +9 -9
  111. datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
  112. datahub/sql_parsing/sqlglot_lineage.py +15 -5
  113. datahub/sql_parsing/tool_meta_extractor.py +119 -5
  114. datahub/utilities/time.py +8 -3
  115. datahub/utilities/urns/_urn_base.py +5 -7
  116. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  117. datahub/specific/custom_properties.py +0 -37
  118. datahub/specific/ownership.py +0 -48
  119. datahub/specific/structured_properties.py +0 -53
  120. {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_dataclasses.py
@@ -186,16 +186,16 @@ class LookerModel:
                     f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
                 )
                 if "*" not in inc and not included_files:
-                    reporter.report_failure(
+                    reporter.warning(
                         title="Error Resolving Include",
-                        message=f"Cannot resolve include {inc}",
-                        context=f"Path: {path}",
+                        message="Cannot resolve included file",
+                        context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                     )
                 elif not included_files:
-                    reporter.report_failure(
+                    reporter.warning(
                         title="Error Resolving Include",
-                        message=f"Did not resolve anything for wildcard include {inc}",
-                        context=f"Path: {path}",
+                        message="Did not find anything matching the wildcard include",
+                        context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                     )
                 # only load files that we haven't seen so far
                 included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@ class LookerModel:
                             source_config,
                             reporter,
                             seen_so_far,
-                            traversal_path=traversal_path
-                            + "."
-                            + pathlib.Path(included_file).stem,
+                            traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
                         )
                     )
             except Exception as e:
datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel):
     get_look_calls: int = 0
     search_looks_calls: int = 0
     search_dashboards_calls: int = 0
+    all_user_calls: int = 0


 class LookerAPI:
@@ -135,7 +136,7 @@ class LookerAPI:

         return permissions

-    @lru_cache(maxsize=1000)
+    @lru_cache(maxsize=5000)
     def get_user(self, id_: str, user_fields: str) -> Optional[User]:
         self.client_stats.user_calls += 1
         try:
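For context on the `maxsize` bump: `functools.lru_cache` memoizes `get_user` per argument tuple and evicts the least recently used entry once the cache is full, so a Looker instance with more than 1000 active users would keep re-fetching evicted users from the API. A minimal sketch of that eviction behavior (the function and counter below are illustrative, not from this package):

from functools import lru_cache

calls = 0

@lru_cache(maxsize=2)  # deliberately tiny to demonstrate eviction
def get_user(user_id: str) -> str:
    global calls
    calls += 1  # stands in for a real API round trip
    return f"user-{user_id}"

get_user("a"); get_user("b")  # two misses; cache is now full
get_user("a")                 # hit: no new call
get_user("c")                 # miss: evicts "b" (least recently used)
get_user("b")                 # miss again, because "b" was evicted
print(calls)                  # 4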
@@ -154,6 +155,17 @@ class LookerAPI:
             # User not found
             return None

+    def all_users(self, user_fields: str) -> Sequence[User]:
+        self.client_stats.all_user_calls += 1
+        try:
+            return self.client.all_users(
+                fields=cast(str, user_fields),
+                transport_options=self.transport_options,
+            )
+        except SDKError as e:
+            logger.warning(f"Failure was {e}")
+            return []
+
     def execute_query(self, write_query: WriteQuery) -> List[Dict]:
         logger.debug(f"Executing query {write_query}")
         self.client_stats.query_calls += 1
datahub/ingestion/source/looker/looker_source.py
@@ -68,6 +68,7 @@ from datahub.ingestion.source.looker.looker_common import (
     ViewField,
     ViewFieldType,
     gen_model_key,
+    get_urn_looker_element_id,
 )
 from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig
 from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
@@ -145,7 +146,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         self.source_config: LookerDashboardSourceConfig = config
         self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
         self.looker_api: LookerAPI = LookerAPI(self.source_config)
-        self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api)
+        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+            self.looker_api, self.reporter
+        )
         self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
             self.looker_api, self.reporter, self.source_config
         )
@@ -163,6 +166,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # Required, as we do not ingest all folders but only those that have dashboards/looks
         self.processed_folders: List[str] = []

+        # Keep track of ingested chart urns, to omit usage for non-ingested entities
+        self.chart_urns: Set[str] = set()
+
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
@@ -640,6 +646,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         chart_urn = self._make_chart_urn(
             element_id=dashboard_element.get_urn_element_id()
         )
+        self.chart_urns.add(chart_urn)
         chart_snapshot = ChartSnapshot(
             urn=chart_urn,
             aspects=[Status(removed=False)],
@@ -1378,7 +1385,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             yield from self._emit_folder_as_container(folder)

     def extract_usage_stat(
-        self, looker_dashboards: List[looker_usage.LookerDashboardForUsage]
+        self,
+        looker_dashboards: List[looker_usage.LookerDashboardForUsage],
+        ingested_chart_urns: Set[str],
     ) -> List[MetadataChangeProposalWrapper]:
         looks: List[looker_usage.LookerChartForUsage] = []
         # filter out look from all dashboard
@@ -1389,6 +1398,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         # dedup looks
         looks = list({str(look.id): look for look in looks}.values())
+        filtered_looks = []
+        for look in looks:
+            if not look.id:
+                continue
+            chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id))
+            if chart_urn in ingested_chart_urns:
+                filtered_looks.append(look)
+            else:
+                self.reporter.charts_skipped_for_usage.add(look.id)

         # Keep stat generators to generate entity stat aspect later
         stat_generator_config: looker_usage.StatGeneratorConfig = (
@@ -1412,7 +1430,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             stat_generator_config,
             self.reporter,
             self._make_chart_urn,
-            looks,
+            filtered_looks,
         )

         mcps: List[MetadataChangeProposalWrapper] = []
@@ -1667,11 +1685,20 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         if self.source_config.extract_usage_history:
             self.reporter.report_stage_start("usage_extraction")
             usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat(
-                looker_dashboards_for_usage
+                looker_dashboards_for_usage, self.chart_urns
             )
             for usage_mcp in usage_mcps:
                 yield usage_mcp.as_workunit()
             self.reporter.report_stage_end("usage_extraction")

+        # Dump looker user resource mappings.
+        logger.info("Ingesting looker user resource mapping workunits")
+        self.reporter.report_stage_start("user_resource_extraction")
+        yield from auto_workunit(
+            self.user_registry.to_platform_resource(
+                self.source_config.platform_instance
+            )
+        )
+
     def get_report(self) -> SourceReport:
         return self.reporter
datahub/ingestion/source/looker/looker_usage.py
@@ -42,6 +42,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
     _Aspect as AspectAbstract,
 )
+from datahub.utilities.lossy_collections import LossySet

 logger = logging.getLogger(__name__)

@@ -170,7 +171,7 @@ class BaseStatGenerator(ABC):
         self.config = config
         self.looker_models = looker_models
         # Later it will help to find out for what are the looker entities from query result
-        self.id_vs_model: Dict[str, ModelForUsage] = {
+        self.id_to_model: Dict[str, ModelForUsage] = {
             self.get_id(looker_object): looker_object for looker_object in looker_models
         }
         self.post_filter = len(self.looker_models) > 100
@@ -225,6 +226,10 @@
     def get_id_from_row(self, row: dict) -> str:
         pass

+    @abstractmethod
+    def report_skip_set(self) -> LossySet[str]:
+        pass
+
     def create_mcp(
         self, model: ModelForUsage, aspect: Aspect
     ) -> MetadataChangeProposalWrapper:
@@ -258,20 +263,11 @@

         return entity_stat_aspect

-    def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]:
-        aspects: List[Tuple[ModelForUsage, AspectAbstract]] = []
-        for looker_object in self.looker_models:
-            aspects.append(
-                (looker_object, self.to_entity_absolute_stat_aspect(looker_object))
-            )
-
-        return aspects
-
     def _fill_user_stat_aspect(
         self,
         entity_usage_stat: Dict[Tuple[str, str], Aspect],
         user_wise_rows: List[Dict],
-    ) -> Iterable[Tuple[ModelForUsage, Aspect]]:
+    ) -> Iterable[Tuple[str, Aspect]]:
         logger.debug("Entering fill user stat aspect")

         # We first resolve all the users using a threadpool to warm up the cache
@@ -300,7 +296,7 @@

         for row in user_wise_rows:
             # Confirm looker object was given for stat generation
-            looker_object = self.id_vs_model.get(self.get_id_from_row(row))
+            looker_object = self.id_to_model.get(self.get_id_from_row(row))
             if looker_object is None:
                 logger.warning(
                     "Looker object with id({}) was not register with stat generator".format(
@@ -338,7 +334,7 @@
         logger.debug("Starting to yield answers for user-wise counts")

         for (id, _), aspect in entity_usage_stat.items():
-            yield self.id_vs_model[id], aspect
+            yield id, aspect

     def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]:
         rows = []
@@ -357,7 +353,7 @@
             )
             if self.post_filter:
                 logger.debug("post filtering")
-                rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model]
+                rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model]
                 logger.debug("Filtered down to %d rows", len(rows))
         except Exception as e:
             logger.warning(f"Failed to execute {query_name} query: {e}")
@@ -378,7 +374,8 @@
             return

         # yield absolute stat for looker entities
-        for looker_object, aspect in self._process_absolute_aspect():  # type: ignore
+        for looker_object in self.looker_models:
+            aspect = self.to_entity_absolute_stat_aspect(looker_object)
             yield self.create_mcp(looker_object, aspect)

         # Execute query and process the raw json which contains stat information
@@ -399,10 +396,13 @@
         )
         user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query")
         # yield absolute stat for entity
-        for looker_object, aspect in self._fill_user_stat_aspect(
+        for object_id, aspect in self._fill_user_stat_aspect(
             entity_usage_stat, user_wise_rows
         ):
-            yield self.create_mcp(looker_object, aspect)
+            if object_id in self.id_to_model:
+                yield self.create_mcp(self.id_to_model[object_id], aspect)
+            else:
+                self.report_skip_set().add(object_id)


 class DashboardStatGenerator(BaseStatGenerator):
@@ -425,6 +425,9 @@ class DashboardStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "DashboardStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.dashboards_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             HistoryViewField.HISTORY_DASHBOARD_ID: ",".join(
@@ -541,6 +544,9 @@ class LookStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "ChartStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.charts_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             LookViewField.LOOK_ID: ",".join(
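The `report_skip_set()` hook added across these hunks gives each concrete generator a place to record ids it could not map back to a known model, instead of silently dropping them; `LossySet` (from `datahub.utilities.lossy_collections`) keeps the report bounded. A rough sketch of the dispatch pattern, using a plain `set` as a stand-in for `LossySet` and illustrative names throughout:

from abc import ABC, abstractmethod
from typing import Dict, Set


class BaseStatGenerator(ABC):
    def __init__(self, id_to_model: Dict[str, object]) -> None:
        self.id_to_model = id_to_model

    @abstractmethod
    def report_skip_set(self) -> Set[str]:
        """Where this generator records ids it had to skip."""

    def emit(self, object_id: str) -> None:
        if object_id in self.id_to_model:
            print(f"emit usage aspect for {object_id}")
        else:
            self.report_skip_set().add(object_id)  # recorded on the report, not raised


class DashboardStatGenerator(BaseStatGenerator):
    def __init__(self, id_to_model: Dict[str, object]) -> None:
        super().__init__(id_to_model)
        self.dashboards_skipped: Set[str] = set()  # LossySet in the real code

    def report_skip_set(self) -> Set[str]:
        return self.dashboards_skipped


gen = DashboardStatGenerator({"dash-1": object()})
gen.emit("dash-1")             # known id -> emitted
gen.emit("dash-99")            # unknown id -> recorded in the skip set
print(gen.dashboards_skipped)  # {'dash-99'}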
datahub/ingestion/source/mlflow.py
@@ -38,16 +38,30 @@ T = TypeVar("T")
 class MLflowConfig(EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
-        description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)",
+        description=(
+            "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+            " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+        ),
     )
     registry_uri: Optional[str] = Field(
         default=None,
-        description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)",
+        description=(
+            "Registry server URI. If not set, an MLflow default registry_uri is used"
+            " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+        ),
     )
     model_name_separator: str = Field(
         default="_",
         description="A string which separates model name from its version (e.g. model_1 or model-1)",
     )
+    base_external_url: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base URL to use when constructing external URLs to MLflow."
+            " If not set, tracking_uri is used if it's an HTTP URL."
+            " If neither is set, external URLs are not generated."
+        ),
+    )


 @dataclass
@@ -279,12 +293,23 @@ class MLflowSource(Source):
         )
         return urn

-    def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]:
+    def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+        if isinstance(
+            self.client.tracking_uri, str
+        ) and self.client.tracking_uri.startswith("http"):
+            return self.client.tracking_uri
+        else:
+            return None
+
+    def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
         """
         Generate URL for a Model Version to MLflow UI.
         """
-        base_uri = self.client.tracking_uri
-        if base_uri.startswith("http"):
+        base_uri = (
+            self.config.base_external_url
+            or self._get_base_external_url_from_tracking_uri()
+        )
+        if base_uri:
             return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
         else:
             return None
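The resolution order introduced here is: an explicit `base_external_url` wins, then the tracking URI if it is an HTTP(S) URL, otherwise no external URL is emitted. A standalone sketch of that precedence (function name and URLs are illustrative):

from typing import Optional


def resolve_base_url(
    base_external_url: Optional[str], tracking_uri: Optional[str]
) -> Optional[str]:
    # Explicit configuration takes priority over the tracking-URI fallback.
    if base_external_url:
        return base_external_url
    if isinstance(tracking_uri, str) and tracking_uri.startswith("http"):
        return tracking_uri
    return None


assert resolve_base_url("https://mlflow.example.com", "file:/mlruns") == "https://mlflow.example.com"
assert resolve_base_url(None, "http://localhost:5000") == "http://localhost:5000"
assert resolve_base_url(None, "file:/mlruns") is None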
datahub/ingestion/source/mode.py
@@ -5,6 +5,7 @@ import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
+from json import JSONDecodeError
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     TagPropertiesClass,
     UpstreamClass,
     UpstreamLineageClass,
+    ViewPropertiesClass,
 )
 from datahub.metadata.urns import QueryUrn
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -192,6 +194,9 @@ class HTTPError429(HTTPError):
     pass


+ModeRequestError = (HTTPError, JSONDecodeError)
+
+
 @dataclass
 class ModeSourceReport(StaleEntityRemovalSourceReport):
     filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList)
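`ModeRequestError` is a plain tuple of exception types, which `except` clauses accept directly; the hunks below swap every `except HTTPError` for it so that non-JSON response bodies are handled the same way as transport errors. A minimal sketch of the pattern (URL and function are illustrative):

from json import JSONDecodeError

import requests
from requests import HTTPError

# A tuple of exception classes is valid in an `except` clause.
ModeRequestError = (HTTPError, JSONDecodeError)


def fetch_json(url: str) -> dict:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # may raise HTTPError
        return response.json()       # may raise a JSONDecodeError subclass
    except ModeRequestError as e:
        print(f"request failed: {e}")
        return {}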
@@ -327,11 +332,11 @@ class ModeSource(StatefulIngestionSourceBase):
         # Test the connection
         try:
             self._get_request_json(f"{self.config.connect_uri}/api/verify")
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
                 message="Unable to verify connection to mode.",
-                context=f"Error: {str(http_error)}",
+                context=f"Error: {str(e)}",
             )

         self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
@@ -520,11 +525,11 @@
                 if self.config.owner_username_instead_of_email
                 else user_json.get("email")
             )
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_warning(
                 title="Failed to retrieve Mode creator",
                 message=f"Unable to retrieve user for {href}",
-                context=f"Reason: {str(http_error)}",
+                context=f"Reason: {str(e)}",
             )
         return user

@@ -570,11 +575,11 @@
                     logging.debug(f"Skipping space {space_name} due to space pattern")
                     continue
                 space_info[s.get("token", "")] = s.get("name", "")
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
                 message="Unable to retrieve spaces / collections for workspace.",
-                context=f"Workspace: {self.workspace_uri}, Error: {str(http_error)}",
+                context=f"Workspace: {self.workspace_uri}, Error: {str(e)}",
             )

         return space_info
@@ -720,11 +725,11 @@
         try:
             ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources")
             data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to retrieve Data Sources",
                 message="Unable to retrieve data sources from Mode.",
-                context=f"Error: {str(http_error)}",
+                context=f"Error: {str(e)}",
             )

         return data_sources
@@ -811,11 +816,11 @@
                 if definition.get("name", "") == definition_name:
                     return definition.get("source", "")

-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Definition",
                 message="Unable to retrieve definition from Mode.",
-                context=f"Definition Name: {definition_name}, Error: {str(http_error)}",
+                context=f"Definition Name: {definition_name}, Error: {str(e)}",
             )
         return None

@@ -930,16 +935,13 @@

         dataset_props = DatasetPropertiesClass(
             name=report_info.get("name") if is_mode_dataset else query_data.get("name"),
-            description=f"""### Source Code
-``` sql
-{query_data.get("raw_query")}
-```
-            """,
+            description=None,
             externalUrl=externalUrl,
             customProperties=self.get_custom_props_from_dict(
                 query_data,
                 [
-                    "id" "created_at",
+                    "id",
+                    "created_at",
                     "updated_at",
                     "last_run_id",
                     "data_source_id",
@@ -949,7 +951,6 @@
                 ],
             ),
         )
-
         yield (
             MetadataChangeProposalWrapper(
                 entityUrn=query_urn,
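The `"id" "created_at",` line fixed above is a classic missing-comma bug: Python implicitly concatenates adjacent string literals, so the list held a single `"idcreated_at"` entry and neither property was ever picked up. A short demonstration:

broken = ["id" "created_at", "updated_at"]  # implicit concatenation
fixed = ["id", "created_at", "updated_at"]

print(broken)  # ['idcreated_at', 'updated_at'] -- neither key matches
print(fixed)   # ['id', 'created_at', 'updated_at']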
@@ -957,6 +958,16 @@
             ).as_workunit()
         )

+        if raw_query := query_data.get("raw_query"):
+            yield MetadataChangeProposalWrapper(
+                entityUrn=query_urn,
+                aspect=ViewPropertiesClass(
+                    viewLogic=raw_query,
+                    viewLanguage=QueryLanguageClass.SQL,
+                    materialized=False,
+                ),
+            ).as_workunit()
+
         if is_mode_dataset:
             space_container_key = self.gen_space_key(space_token)
             yield from add_dataset_to_container(
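The `if raw_query := ...` guard uses an assignment expression so the `ViewProperties` aspect is only emitted when `raw_query` is present and non-empty, replacing the SQL that previously lived in the dataset description. A minimal sketch of the guard itself (the dict is a stand-in for the real Mode payload):

query_data = {"name": "my query", "raw_query": "SELECT 1"}

if raw_query := query_data.get("raw_query"):
    print(f"emit ViewProperties with viewLogic={raw_query!r}")
else:
    print("skip: no raw_query in payload")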
@@ -1375,11 +1386,11 @@
                 f"{self.workspace_uri}/spaces/{space_token}/reports"
             )
             reports = reports_json.get("_embedded", {}).get("reports", {})
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
-                context=f"Space Token: {space_token}, Error: {str(http_error)}",
+                context=f"Space Token: {space_token}, Error: {str(e)}",
             )
         return reports

@@ -1393,11 +1404,11 @@
             url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
             datasets_json = self._get_request_json(url)
             datasets = datasets_json.get("_embedded", {}).get("reports", [])
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
-                context=f"Error: {str(http_error)}",
+                context=f"Error: {str(e)}",
             )
         return datasets

@@ -1409,11 +1420,11 @@
                 f"{self.workspace_uri}/reports/{report_token}/queries"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token: {report_token}, Error: {str(http_error)}",
+                context=f"Report Token: {report_token}, Error: {str(e)}",
             )
         return queries

@@ -1426,11 +1437,11 @@
                 f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token:{report_token}, Error: {str(http_error)}",
+                context=f"Report Token:{report_token}, Error: {str(e)}",
             )
             return {}
         return queries
@@ -1444,13 +1455,13 @@
                 f"/queries/{query_token}/charts"
             )
             charts = charts_json.get("_embedded", {}).get("charts", {})
-        except HTTPError as http_error:
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
                 message="Unable to retrieve charts from Mode.",
                 context=f"Report Token: {report_token}, "
                 f"Query token: {query_token}, "
-                f"Error: {str(http_error)}",
+                f"Error: {str(e)}",
             )
         return charts

@@ -1470,6 +1481,8 @@
             response = self.session.get(
                 url, timeout=self.config.api_options.timeout
             )
+            if response.status_code == 204:  # No content, don't parse json
+                return {}
             return response.json()
         except HTTPError as http_error:
             error_response = http_error.response
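An HTTP 204 response carries no body, so calling `.json()` on it raises a decode error; the guard above short-circuits with an empty dict instead. A minimal reproduction of the same guard with `requests` (names are illustrative):

import requests


def get_request_json(session: requests.Session, url: str, timeout: int = 40) -> dict:
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    if response.status_code == 204:  # No Content: empty body, .json() would fail
        return {}
    return response.json()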
datahub/ingestion/source/powerbi/config.py
@@ -9,7 +9,7 @@ from pydantic.class_validators import root_validator

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
     return dict_


-class PlatformDetail(ConfigModel):
-    platform_instance: Optional[str] = pydantic.Field(
-        default=None,
-        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
-        "with platform instance name used in ingestion "
-        "recipe of other datahub sources.",
-    )
-    env: str = pydantic.Field(
-        default=builder.DEFAULT_ENV,
-        description="The environment that all assets produced by DataHub platform ingestion source belong to",
-    )
-
-
 class DataBricksPlatformDetail(PlatformDetail):
     """
     metastore is an additional field used in Databricks connector to generate the dataset urn
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py
@@ -2,8 +2,8 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Union

+from datahub.configuration.source_common import PlatformDetail
 from datahub.ingestion.source.powerbi.config import (
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBIPlatformDetail,
 )
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -5,13 +5,13 @@ from typing import Dict, List, Optional, Tuple, Type, cast

 from lark import Tree

+from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     Constant,
     DataBricksPlatformDetail,
     DataPlatformPair,
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
     PowerBIPlatformDetail,
datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule
@@ -21,6 +21,11 @@
 // | empty_string
 // | empty_string "," argument_list
 // - Added sql_string in any_literal
+// - Added WS_INLINE? in field expression
+// - Added to ignore any comments:
+//     %ignore WS             // Ignore whitespace
+//     %ignore CPP_COMMENT    // Ignore single-line comments
+//     %ignore C_COMMENT      // Ignore multi-line comments

 lexical_unit: lexical_elements?

@@ -245,6 +250,8 @@ operator_or_punctuator: ","
     | "=>"
     | ".."
     | "..."
+    | "{{"
+    | "}}"

 document: section_document
     | expression_document
@@ -275,6 +282,7 @@ expression: logical_or_expression
     | if_expression
     | error_raising_expression
     | error_handling_expression
+    | outer_expression


 logical_or_expression: logical_and_expression
@@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/

 sql_string: "\"" sql_content "\""

+outer_expression: "{{" expression "}}"
+
 argument_list: WS_INLINE? expression
     | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list
     | WS_INLINE? sql_string
@@ -409,7 +419,7 @@ record_expression: "[" field_list? "]"
 field_list: field
     | field "," field_list

-field: field_name WS_INLINE? "=" WS_INLINE? expression
+field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression

 field_name: generalized_identifier
     | quoted_identifier
@@ -621,4 +631,8 @@ any_literal: record_literal
 %import common.DIGIT
 %import common.LF
 %import common.CR
-%import common.ESCAPED_STRING
+%import common.ESCAPED_STRING
+
+%ignore WS             // Ignore whitespace
+%ignore CPP_COMMENT    // Ignore single-line comments
+%ignore C_COMMENT      // Ignore multi-line comments
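The three `%ignore` directives tell Lark to drop whitespace and C-style comments from the token stream before the parser sees them, which is why the grammar rules themselves no longer need to account for comments. A minimal, self-contained Lark grammar (unrelated to the PowerBI one) showing the same directives:

from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /[a-z]+/

    %import common.WS
    %import common.CPP_COMMENT
    %import common.C_COMMENT
    %ignore WS
    %ignore CPP_COMMENT
    %ignore C_COMMENT
"""

parser = Lark(grammar)
tree = parser.parse("hello // line comment\nworld /* block comment */")
print(tree.pretty())  # both words parsed; comments and whitespace dropped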