acryl-datahub 1.0.0.2rc4 → 1.0.0.3 (py3-none-any wheel)

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
@@ -471,7 +471,10 @@ def get_view_file_path(
     logger.debug("Entered")

     for field in lkml_fields:
-        if field.view == view_name:
+        if (
+            LookerUtil.extract_view_name_from_lookml_model_explore_field(field)
+            == view_name
+        ):
             # This path is relative to git clone directory
             logger.debug(f"Found view({view_name}) file-path {field.source_file}")
             return field.source_file
@@ -1103,7 +1106,7 @@ class LookerExplore:
                     [column_ref] if column_ref is not None else []
                 )

-            return cls(
+            looker_explore = cls(
                 name=explore_name,
                 model_name=model,
                 project_name=explore.project_name,
@@ -1121,6 +1124,8 @@ class LookerExplore:
                 source_file=explore.source_file,
                 tags=list(explore.tags) if explore.tags is not None else [],
             )
+            logger.debug(f"Created LookerExplore from API: {looker_explore}")
+            return looker_explore
         except SDKError as e:
             if "<title>Looker Not Found (404)</title>" in str(e):
                 logger.info(
@@ -1161,6 +1166,9 @@ class LookerExplore:
         dataset_name = config.explore_naming_pattern.replace_variables(
             self.get_mapping(config)
         )
+        logger.debug(
+            f"Generated dataset_name={dataset_name} for explore with model_name={self.model_name}, name={self.name}"
+        )

         return builder.make_dataset_urn_with_platform_instance(
             platform=config.platform_name,
@@ -1362,6 +1370,7 @@ class LookerExploreRegistry:

     @lru_cache(maxsize=200)
     def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
+        logger.debug(f"Retrieving explore: model={model}, explore={explore}")
         looker_explore = LookerExplore.from_api(
             model,
             explore,
@@ -1369,6 +1378,12 @@
             self.report,
             self.source_config,
         )
+        if looker_explore is not None:
+            logger.debug(
+                f"Found explore with model_name={looker_explore.model_name}, name={looker_explore.name}"
+            )
+        else:
+            logger.debug(f"No explore found for model={model}, explore={explore}")
         return looker_explore

     def compute_stats(self) -> Dict:
@@ -113,7 +113,7 @@ class LookerAPI:
             )
         except SDKError as e:
             raise ConfigurationError(
-                f"Failed to connect/authenticate with looker - check your configuration: {e}"
+                "Failed to connect/authenticate with looker - check your configuration"
             ) from e

         self.client_stats = LookerAPIStats()
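The revised message above drops the interpolated exception text because `raise ... from e` already chains the original error; callers can still reach it via `__cause__`. A minimal illustration of that chaining, using generic exception types rather than the Looker SDK:

```python
class ConfigurationProblem(Exception):
    pass


def connect() -> None:
    try:
        raise TimeoutError("upstream timed out")  # stand-in for an SDK error
    except TimeoutError as e:
        # The cause is chained automatically, so the message need not repeat it.
        raise ConfigurationProblem(
            "Failed to connect/authenticate - check your configuration"
        ) from e


try:
    connect()
except ConfigurationProblem as err:
    assert isinstance(err.__cause__, TimeoutError)  # original error is preserved
```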
@@ -279,6 +279,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             return []
         result = []

+        if query is not None:
+            logger.debug(
+                f"Processing query: model={query.model}, view={query.view}, input_fields_count={len(query.fields) if query.fields else 0}"
+            )
+
         # query.dynamic_fields can contain:
         # - looker table calculations: https://docs.looker.com/exploring-data/using-table-calculations
         # - looker custom measures: https://docs.looker.com/de/exploring-data/adding-fields/custom-measure
@@ -399,9 +404,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             # Get the explore from the view directly
             explores = [element.query.view] if element.query.view is not None else []
             logger.debug(
-                f"Element {element.title}: Explores added via query: {explores}"
+                f"Dashboard element {element.title} (ID: {element.id}): Upstream explores added via query={explores} with model={element.query.model}, explore={element.query.view}"
             )
             for exp in explores:
+                logger.debug(
+                    f"Adding reachable explore: model={element.query.model}, explore={exp}, element_id={element.id}, title={element.title}"
+                )
                 self.add_reachable_explore(
                     model=element.query.model,
                     explore=exp,
@@ -477,12 +485,10 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

        # Failing the above two approaches, pick out details from result_maker
        elif element.result_maker is not None:
-            model: str = ""
            input_fields = []

            explores = []
            if element.result_maker.query is not None:
-                model = element.result_maker.query.model
                if element.result_maker.query.view is not None:
                    explores.append(element.result_maker.query.view)
                input_fields = self._get_input_fields_from_query(
@@ -502,9 +508,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

            # In addition to the query, filters can point to fields as well
            assert element.result_maker.filterables is not None
+
+            # Different dashboard elements my reference explores from different models
+            # so we need to create a mapping of explore names to their models to maintain correct associations
+            explore_to_model_map = {}
+
            for filterable in element.result_maker.filterables:
                if filterable.view is not None and filterable.model is not None:
-                    model = filterable.model
+                    # Store the model for this view/explore in our mapping
+                    explore_to_model_map[filterable.view] = filterable.model
                    explores.append(filterable.view)
                    self.add_reachable_explore(
                        model=filterable.model,
@@ -527,6 +539,18 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

            explores = sorted(list(set(explores)))  # dedup the list of views

+            logger.debug(
+                f"Dashboard element {element.id} and their explores with the corresponding model: {explore_to_model_map}"
+            )
+
+            # If we have a query, use its model as the default for any explores that don't have a model in our mapping
+            default_model = ""
+            if (
+                element.result_maker.query is not None
+                and element.result_maker.query.model is not None
+            ):
+                default_model = element.result_maker.query.model
+
            return LookerDashboardElement(
                id=element.id,
                title=element.title if element.title is not None else "",
@@ -540,7 +564,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
                    else ""
                ),
                upstream_explores=[
-                    LookerExplore(model_name=model, name=exp) for exp in explores
+                    LookerExplore(
+                        model_name=explore_to_model_map.get(exp, default_model),
+                        name=exp,
+                    )
+                    for exp in explores
                ],
                input_fields=input_fields,
                owner=None,
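Because different dashboard elements may reference explores from different models, the change above records each filterable's model in `explore_to_model_map` and falls back to the query's model when no mapping exists. A toy sketch of that lookup-with-default behavior (the explore and model names here are made up):

```python
# Hypothetical data: models seen per explore via filterables, plus a query-level default.
explore_to_model_map = {"orders": "finance_model"}
default_model = "marketing_model"

explores = ["orders", "web_sessions"]
resolved = {exp: explore_to_model_map.get(exp, default_model) for exp in explores}
assert resolved == {"orders": "finance_model", "web_sessions": "marketing_model"}
```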
@@ -1270,6 +1298,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         chart_urn = self._make_chart_urn(
             element_id=dashboard_element.get_urn_element_id()
         )
+
         input_fields_aspect = InputFieldsClass(
             fields=self._input_fields_from_dashboard_element(dashboard_element)
         )
@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                     f"Failed to find a project name for model {model_name}"
                 )
             return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
             raise ValueError(
                 f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                 f"in your config file"
@@ -36,6 +36,7 @@ from datahub.ingestion.api.source_helpers import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
@@ -210,7 +211,7 @@ def _get_lineage_mcp(

     # extract the old lineage and save it for the new mcp
     if preserve_upstream:
-        client = get_default_graph()
+        client = get_default_graph(ClientMode.INGESTION)

         old_upstream_lineage = get_aspects_for_entity(
             client._session,
@@ -7,6 +7,7 @@ from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
 from mlflow import MlflowClient
 from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
+from mlflow.exceptions import MlflowException
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field

@@ -589,8 +590,8 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return runs

-    @staticmethod
     def _traverse_mlflow_search_func(
+        self,
         search_func: Callable[..., PagedList[T]],
         **kwargs: Any,
     ) -> Iterable[T]:
@@ -598,12 +599,24 @@ class MLflowSource(StatefulIngestionSourceBase):
         Utility to traverse an MLflow search_* functions which return PagedList.
         """
         next_page_token = None
-        while True:
-            paged_list = search_func(page_token=next_page_token, **kwargs)
-            yield from paged_list.to_list()
-            next_page_token = paged_list.token
-            if not next_page_token:
+        try:
+            while True:
+                paged_list = search_func(page_token=next_page_token, **kwargs)
+                yield from paged_list.to_list()
+                next_page_token = paged_list.token
+                if not next_page_token:
+                    return
+        except MlflowException as e:
+            if e.error_code == "ENDPOINT_NOT_FOUND":
+                self.report.warning(
+                    title="MLflow API Endpoint Not Found for Experiments.",
+                    message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
+                    context=None,
+                    exc=e,
+                )
                 return
+            else:
+                raise  # Only re-raise other exceptions

     def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
         return (
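The traversal above wraps MLflow's token-paged `search_*` APIs in a generator and downgrades a missing endpoint to a warning instead of failing the run. A minimal standalone sketch of the same token-pagination pattern, with a hypothetical `fetch_page` callable standing in for the MLflow client:

```python
from typing import Any, Callable, Iterable, List, Optional, Tuple

# Hypothetical page shape: (items, next_token); a falsy token means "no more pages".
Page = Tuple[List[Any], Optional[str]]


def traverse_paged(fetch_page: Callable[[Optional[str]], Page]) -> Iterable[Any]:
    """Yield items from a token-paged API until the token runs out."""
    token: Optional[str] = None
    while True:
        items, token = fetch_page(token)
        yield from items
        if not token:
            return


# Usage with an in-memory stand-in for the remote API.
_pages = {None: (["a", "b"], "t1"), "t1": (["c"], None)}
assert list(traverse_paged(lambda tok: _pages[tok])) == ["a", "b", "c"]
```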
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic
@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
     pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)


@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_query_template_render: int = 0
     num_query_template_render_failures: int = 0
     num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
         # Datasets
         datasets = []
         for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error
+
             dataset_urn = builder.make_dataset_urn_with_platform_instance(
                 self.platform,
                 str(mode_dataset.get("id")),
@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            payload = self._get_request_json(f"{self.workspace_uri}/spaces?filter=all")
-            spaces = payload.get("_embedded", {}).get("spaces", {})
-            logger.debug(
-                f"Got {len(spaces)} spaces from workspace {self.workspace_uri}"
-            )
-            for s in spaces:
-                logger.debug(f"Space: {s.get('name')}")
-                space_name = s.get("name", "")
-                # Using both restricted and default_access_level because
-                # there is a current bug with restricted returning False everytime
-                # which has been reported to Mode team
-                if self.config.exclude_restricted and (
-                    s.get("restricted") or s.get("default_access_level") == "restricted"
-                ):
-                    logging.debug(
-                        f"Skipping space {space_name} due to exclude restricted"
-                    )
-                    continue
-                if not self.config.space_pattern.allowed(space_name):
-                    self.report.report_dropped_space(space_name)
-                    logging.debug(f"Skipping space {space_name} due to space pattern")
-                    continue
-                space_info[s.get("token", "")] = s.get("name", "")
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
         )
         return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
                 multiplier=self.config.api_options.retry_backoff_multiplier,
                 max=self.config.api_options.max_retry_interval,
             ),
-            retry=retry_if_exception_type((HTTPError429, ConnectionError)),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
             stop=stop_after_attempt(self.config.api_options.max_attempts),
         )

@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
         except HTTPError as http_error:
             error_response = http_error.response
             if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                 # respect Retry-After
                 sleep_time = error_response.headers.get("retry-after")
                 if sleep_time is not None:
                     time.sleep(float(sleep_time))
                 raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

             logger.debug(
                 f"Error response ({error_response.status_code}): {error_response.text}"
@@ -5,27 +5,35 @@ from typing import Any, Dict, Iterable, List, Optional, Type, Union

 import pandas as pd
 from neo4j import GraphDatabase
-from pydantic.fields import Field
+from pydantic import Field

 from datahub.configuration.source_common import (
     EnvConfigMixin,
+    PlatformInstanceConfigMixin,
+)
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataset_urn_with_platform_instance,
 )
-from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
+    SourceCapability,
 )
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -64,12 +72,16 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }


-class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
+class Neo4jConfig(
+    StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
+):
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
     env: str = Field(description="Neo4j env")

+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+

 @dataclass
 class Neo4jSourceReport(StatefulIngestionReport):
@@ -79,21 +91,27 @@ class Neo4jSourceReport(StatefulIngestionReport):

 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
+)
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
     NODE = "node"
     RELATIONSHIP = "relationship"
-    PLATFORM = "neo4j"
+    config: Neo4jConfig
+    report: Neo4jSourceReport

-    def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
+    def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
         self.ctx = ctx
         self.config = config
+        self.platform = "neo4j"
         self.report: Neo4jSourceReport = Neo4jSourceReport()

     @classmethod
-    def create(cls, config_dict, ctx):
+    def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
         config = Neo4jConfig.parse_obj(config_dict)
-        return cls(ctx, config)
+        return cls(config, ctx)

     def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
         type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
@@ -123,34 +141,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
         dataset: str,
         description: Optional[str] = None,
         custom_properties: Optional[Dict[str, str]] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Iterable[MetadataWorkUnit]:
         dataset_properties = DatasetPropertiesClass(
             description=description,
             customProperties=custom_properties,
         )
-        return MetadataChangeProposalWrapper(
-            entityUrn=make_dataset_urn(
-                platform=self.PLATFORM, name=dataset, env=self.config.env
+        yield MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn_with_platform_instance(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
             ),
             aspect=dataset_properties,
-        )
+        ).as_workunit()

     def generate_neo4j_object(
         self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         try:
             fields = [
                 self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                 for d in columns
                 for key, value in d.items()
             ]
-            mcp = MetadataChangeProposalWrapper(
-                entityUrn=make_dataset_urn(
-                    platform=self.PLATFORM, name=dataset, env=self.config.env
+            return MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn_with_platform_instance(
+                    platform=self.platform,
+                    name=dataset,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
                 ),
                 aspect=SchemaMetadataClass(
                     schemaName=dataset,
-                    platform=make_data_platform_urn(self.PLATFORM),
+                    platform=make_data_platform_urn(self.platform),
                     version=0,
                     hash="",
                     platformSchema=OtherSchemaClass(rawSchema=""),
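The Neo4j hunks above switch from `make_dataset_urn` to `make_dataset_urn_with_platform_instance`, so emitted dataset URNs can carry a `platform_instance`. A small illustrative call with made-up label and instance names (the exact URN rendering may differ slightly):

```python
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Hypothetical values: a Neo4j node label ingested under a named platform instance.
urn = make_dataset_urn_with_platform_instance(
    platform="neo4j",
    name="Person",
    platform_instance="prod-graph",
    env="PROD",
)
# Roughly: urn:li:dataset:(urn:li:dataPlatform:neo4j,prod-graph.Person,PROD)
print(urn)
```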
@@ -161,13 +185,16 @@ class Neo4jSource(StatefulIngestionSourceBase):
                     fields=fields,
                 ),
             )
-            self.report.obj_created += 1
         except Exception as e:
             log.error(e)
-            self.report.obj_failures += 1
-        return mcp
+            self.report.report_failure(
+                message="Failed to process dataset",
+                context=dataset,
+                exc=e,
+            )
+            return None

-    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+    def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
         driver = GraphDatabase.driver(
             self.config.uri, auth=(self.config.username, self.config.password)
         )
@@ -201,13 +228,14 @@ class Neo4jSource(StatefulIngestionSourceBase):

             union_cols = ["key", "obj_type", "property_data_types", "description"]
             df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+            return df
         except Exception as e:
             self.report.failure(
                 message="Failed to get neo4j metadata",
                 exc=e,
             )

-        return df
+        return None

     def process_nodes(self, data: list) -> pd.DataFrame:
         nodes = [record for record in data if record["value"]["type"] == self.NODE]
@@ -306,46 +334,48 @@ class Neo4jSource(StatefulIngestionSourceBase):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
+        if df is None:
+            log.warning("No metadata retrieved from Neo4j")
+            return
+
         for _, row in df.iterrows():
             try:
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=self.generate_neo4j_object(
-                        columns=row["property_data_types"],
-                        dataset=row["key"],
-                    ),
-                    is_primary_source=True,
+                neo4j_obj = self.generate_neo4j_object(
+                    columns=row["property_data_types"],
+                    dataset=row["key"],
                 )
-
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=MetadataChangeProposalWrapper(
-                        entityUrn=make_dataset_urn(
-                            platform=self.PLATFORM,
-                            name=row["key"],
-                            env=self.config.env,
-                        ),
-                        aspect=SubTypesClass(
-                            typeNames=[
-                                DatasetSubTypes.NEO4J_NODE
-                                if row["obj_type"] == self.NODE
-                                else DatasetSubTypes.NEO4J_RELATIONSHIP
-                            ]
-                        ),
+                if neo4j_obj:
+                    yield from auto_workunit([neo4j_obj])
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=make_dataset_urn_with_platform_instance(
+                        platform=self.platform,
+                        name=row["key"],
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
                     ),
-                )
-
-                yield MetadataWorkUnit(
-                    id=row["key"],
-                    mcp=self.add_properties(
-                        dataset=row["key"],
-                        custom_properties=None,
-                        description=row["description"],
+                    aspect=SubTypesClass(
+                        typeNames=[
+                            DatasetSubTypes.NEO4J_NODE
+                            if row["obj_type"] == self.NODE
+                            else DatasetSubTypes.NEO4J_RELATIONSHIP
+                        ]
                    ),
+                ).as_workunit()
+
+                yield from self.add_properties(
+                    dataset=row["key"],
+                    custom_properties=None,
+                    description=row["description"],
                )

            except Exception as e:
-                raise e
+                log.error(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_failure(
+                    message="Error processing Neo4j metadata",
+                    context=row["key"],
+                    exc=e,
+                )

-    def get_report(self):
+    def get_report(self) -> "Neo4jSourceReport":
         return self.report