acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
@@ -22,6 +22,7 @@ from typing import (
22
22
  Union,
23
23
  )
24
24
 
25
+ import progressbar
25
26
  from avro.schema import RecordSchema
26
27
  from pydantic import BaseModel
27
28
  from requests.models import HTTPError
@@ -159,6 +160,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
159
160
  openapi_ingestion=self.config.openapi_ingestion,
160
161
  client_mode=config.client_mode,
161
162
  datahub_component=config.datahub_component,
163
+ server_config_refresh_interval=config.server_config_refresh_interval,
162
164
  )
163
165
  self.server_id: str = _MISSING_SERVER_ID
164
166
 
@@ -234,6 +236,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
234
236
  client_certificate_path=session_config.client_certificate_path,
235
237
  client_mode=session_config.client_mode,
236
238
  datahub_component=session_config.datahub_component,
239
+ server_config_refresh_interval=emitter._server_config_refresh_interval,
237
240
  )
238
241
  )
239
242
 
@@ -502,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
502
505
  "limit": limit,
503
506
  "filter": filter,
504
507
  }
505
- end_point = f"{self.config.server}/aspects?action=getTimeseriesAspectValues"
508
+ end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
506
509
  resp: Dict = self._post_generic(end_point, query_body)
507
510
 
508
511
  values: Optional[List] = resp.get("value", {}).get("values")
@@ -522,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
522
525
  def get_entity_raw(
523
526
  self, entity_urn: str, aspects: Optional[List[str]] = None
524
527
  ) -> Dict:
525
- endpoint: str = f"{self.config.server}/entitiesV2/{Urn.url_encode(entity_urn)}"
528
+ endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
526
529
  if aspects is not None:
527
530
  assert aspects, "if provided, aspects must be a non-empty list"
528
531
  endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -652,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
652
655
 
653
656
  @property
654
657
  def _search_endpoint(self):
655
- return f"{self.config.server}/entities?action=search"
658
+ return f"{self._gms_server}/entities?action=search"
656
659
 
657
660
  @property
658
661
  def _relationships_endpoint(self):
659
- return f"{self.config.server}/openapi/relationships/v1/"
662
+ return f"{self._gms_server}/openapi/relationships/v1/"
660
663
 
661
664
  @property
662
665
  def _aspect_count_endpoint(self):
663
- return f"{self.config.server}/aspects?action=getCount"
666
+ return f"{self._gms_server}/aspects?action=getCount"
664
667
 
665
668
  def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
666
669
  """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -806,7 +809,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
806
809
  "input": search_query,
807
810
  "entity": "container",
808
811
  "start": 0,
809
- "count": 10000,
812
+ "count": 5000,
810
813
  "filter": {"or": container_filters},
811
814
  }
812
815
  results: Dict = self._post_generic(url, search_body)
@@ -901,9 +904,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
901
904
  query: Optional[str] = None,
902
905
  container: Optional[str] = None,
903
906
  status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
904
- batch_size: int = 10000,
907
+ batch_size: int = 5000,
905
908
  extraFilters: Optional[List[RawSearchFilterRule]] = None,
906
909
  extra_or_filters: Optional[RawSearchFilter] = None,
910
+ skip_cache: bool = False,
907
911
  ) -> Iterable[str]:
908
912
  """Fetch all urns that match all of the given filters.
909
913
 
@@ -922,6 +926,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
922
926
  Note that this requires browsePathV2 aspects (added in 0.10.4+).
923
927
  :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities.
924
928
  :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters.
929
+ :param skip_cache: Whether to bypass caching. Defaults to False.
925
930
 
926
931
  :return: An iterable of urns that match the filters.
927
932
  """
@@ -949,7 +954,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
949
954
  $query: String!,
950
955
  $orFilters: [AndFilterInput!],
951
956
  $batchSize: Int!,
952
- $scrollId: String) {
957
+ $scrollId: String,
958
+ $skipCache: Boolean!) {
953
959
 
954
960
  scrollAcrossEntities(input: {
955
961
  query: $query,
@@ -960,6 +966,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
960
966
  searchFlags: {
961
967
  skipHighlighting: true
962
968
  skipAggregates: true
969
+ skipCache: $skipCache
963
970
  }
964
971
  }) {
965
972
  nextScrollId
@@ -978,6 +985,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
978
985
  "query": query,
979
986
  "orFilters": orFilters,
980
987
  "batchSize": batch_size,
988
+ "skipCache": skip_cache,
981
989
  }
982
990
 
983
991
  for entity in self._scroll_across_entities(graphql_query, variables):
@@ -993,7 +1001,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
993
1001
  query: Optional[str] = None,
994
1002
  container: Optional[str] = None,
995
1003
  status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
996
- batch_size: int = 10000,
1004
+ batch_size: int = 5000,
997
1005
  extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
998
1006
  extra_or_filters: Optional[RawSearchFilter] = None,
999
1007
  extra_source_fields: Optional[List[str]] = None,
@@ -1083,7 +1091,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1083
1091
  "query": query,
1084
1092
  "orFilters": or_filters_final,
1085
1093
  "batchSize": batch_size,
1086
- "skipCache": "true" if skip_cache else "false",
1094
+ "skipCache": skip_cache,
1087
1095
  "fetchExtraFields": extra_source_fields,
1088
1096
  }
1089
1097
 
@@ -1202,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1202
1210
  operation_name: Optional[str] = None,
1203
1211
  format_exception: bool = True,
1204
1212
  ) -> Dict:
1205
- url = f"{self.config.server}/api/graphql"
1213
+ url = f"{self._gms_server}/api/graphql"
1206
1214
 
1207
1215
  body: Dict = {
1208
1216
  "query": query,
@@ -1427,6 +1435,83 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1427
1435
  related_aspects = response.get("relatedAspects", [])
1428
1436
  return reference_count, related_aspects
1429
1437
 
1438
def get_kafka_consumer_offsets(
    self,
) -> dict:
    """Get Kafka consumer offsets from the DataHub API.

    Issues one GET request per consumer (``mcp``, ``mcl`` and
    ``mcl-timeseries``) against
    ``<gms>/openapi/operations/kafka/<consumer>/consumer/offsets`` with the
    query parameters ``skipCache=true`` and ``detailed=true``.

    Returns:
        A dict keyed by consumer name (``"mcp"``, ``"mcl"``,
        ``"mcl-timeseries"``) whose values are the responses returned by
        ``_get_generic``. A response containing an ``"errors"`` key is
        logged at error level but is still included in the result.
    """
    # Build the URLs from self._gms_server, matching the other endpoint
    # builders on this client, instead of the raw self.config.server value.
    base_url = f"{self._gms_server}/openapi/operations/kafka"
    params = {"skipCache": "true", "detailed": "true"}

    results = {}
    for key in ("mcp", "mcl", "mcl-timeseries"):
        response = self._get_generic(
            url=f"{base_url}/{key}/consumer/offsets", params=params
        )
        results[key] = response
        if "errors" in response:
            logger.error(f"Error: {response['errors']}")
    return results
1462
+
1463
def _restore_index_call(self, payload_obj: dict) -> None:
    """POST one restoreIndices request and log the response at debug level."""
    result = self._post_generic(
        f"{self._gms_server}/operations?action=restoreIndices", payload_obj
    )
    logger.debug(f"Restore indices result: {result}")

def restore_indices(
    self,
    urn_pattern: Optional[str] = None,
    aspect: Optional[str] = None,
    start: Optional[int] = None,
    batch_size: Optional[int] = None,
    file: Optional[str] = None,
) -> None:
    """Restore the indices for a given urn or urn-like pattern.

    Args:
        urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
            Ignored when ``file`` is provided.
        aspect: Optional aspect string to restore indices for a specific aspect.
        start: Optional row number of the sql store to restore from. Only sent
            when given (documented default: 0). Ignored in case file is provided.
        batch_size: Optional number of rows to restore. Only sent when given
            (documented default: 10). Ignored in case file is provided.
        file: Optional path to a file containing one URN or URN pattern per
            line. One request is issued per non-blank line.

    Returns:
        None. The response of each request is only logged at debug level.
    """
    if file is not None:
        with open(file) as f:
            # Blank lines are not URNs; sending them would issue requests
            # with an empty "urn" value.
            urns = [line.strip() for line in f]
        for urn in progressbar.progressbar([u for u in urns if u]):
            # Build a fresh payload for every line. Reusing one dict would
            # leak a stale "urn"/"urnLike" key from a previous line into
            # this request when the file mixes exact URNs and patterns.
            line_payload: dict = {"urnLike" if "%" in urn else "urn": urn}
            if aspect is not None:
                line_payload["aspect"] = aspect
            self._restore_index_call(line_payload)
        return

    payload_obj: dict = {}
    if urn_pattern is not None:
        # A "%" marks a wildcard pattern; anything else is an exact URN.
        payload_obj["urnLike" if "%" in urn_pattern else "urn"] = urn_pattern
    if aspect is not None:
        payload_obj["aspect"] = aspect
    if start is not None:
        payload_obj["start"] = start
    if batch_size is not None:
        payload_obj["batchSize"] = batch_size
    self._restore_index_call(payload_obj)
1514
+
1430
1515
  @functools.lru_cache
1431
1516
  def _make_schema_resolver(
1432
1517
  self,
@@ -1491,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1491
1576
  env: str = DEFAULT_ENV,
1492
1577
  default_db: Optional[str] = None,
1493
1578
  default_schema: Optional[str] = None,
1494
- default_dialect: Optional[str] = None,
1579
+ override_dialect: Optional[str] = None,
1495
1580
  ) -> "SqlParsingResult":
1496
1581
  from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
1497
1582
 
@@ -1505,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1505
1590
  schema_resolver=schema_resolver,
1506
1591
  default_db=default_db,
1507
1592
  default_schema=default_schema,
1508
- default_dialect=default_dialect,
1593
+ override_dialect=override_dialect,
1509
1594
  )
1510
1595
 
1511
1596
  def create_tag(self, tag_name: str) -> str:
@@ -1732,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1732
1817
  "Accept": "application/json",
1733
1818
  "Content-Type": "application/json",
1734
1819
  }
1735
- url = f"{self.config.server}/openapi/v2/entity/batch/{entity_name}"
1820
+ url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
1736
1821
  response = self._session.post(url, data=json.dumps(payload), headers=headers)
1737
1822
  response.raise_for_status()
1738
1823
 
@@ -1789,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
1789
1874
  "Content-Type": "application/json",
1790
1875
  }
1791
1876
 
1792
- url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
1877
+ url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
1793
1878
  if with_system_metadata:
1794
1879
  url += "?systemMetadata=true"
1795
1880
 
@@ -29,6 +29,7 @@ class DatahubClientConfig(ConfigModel):
29
29
  openapi_ingestion: Optional[bool] = None
30
30
  client_mode: Optional[ClientMode] = None
31
31
  datahub_component: Optional[str] = None
32
+ server_config_refresh_interval: Optional[int] = None
32
33
 
33
34
  class Config:
34
35
  extra = "ignore"
@@ -13,6 +13,7 @@ from datahub.configuration.common import (
13
13
  from datahub.emitter.aspect import JSON_CONTENT_TYPE
14
14
  from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn
15
15
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
16
+ from datahub.emitter.rest_emitter import EmitMode
16
17
  from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
17
18
  from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
18
19
  from datahub.ingestion.api.sink import NoopWriteCallback, Sink
@@ -111,6 +112,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
111
112
  def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
112
113
  assert ctx.pipeline_config is not None
113
114
 
115
+ self.ctx = ctx
114
116
  self.sink: Sink = sink
115
117
  self.report_recipe = report_recipe
116
118
  ingestion_source_key = self.generate_unique_key(ctx.pipeline_config)
@@ -191,18 +193,25 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
191
193
  )
192
194
  return json.dumps(converted_recipe)
193
195
 
194
- def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
195
- self.sink.write_record_async(
196
- RecordEnvelope(
197
- record=MetadataChangeProposalWrapper(
198
- entityUrn=str(entity_urn),
199
- aspect=aspect_value,
200
- ),
201
- metadata={},
202
- ),
203
- NoopWriteCallback(),
196
+ def _emit_aspect(
197
+ self, entity_urn: Urn, aspect_value: _Aspect, try_sync: bool = False
198
+ ) -> None:
199
+ mcp = MetadataChangeProposalWrapper(
200
+ entityUrn=str(entity_urn),
201
+ aspect=aspect_value,
204
202
  )
205
203
 
204
+ if try_sync and self.ctx.graph:
205
+ self.ctx.graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
206
+ else:
207
+ self.sink.write_record_async(
208
+ RecordEnvelope(
209
+ record=mcp,
210
+ metadata={},
211
+ ),
212
+ NoopWriteCallback(),
213
+ )
214
+
206
215
  def on_start(self, ctx: PipelineContext) -> None:
207
216
  assert ctx.pipeline_config is not None
208
217
  # Construct the dataHubExecutionRequestInput aspect
@@ -223,6 +232,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
223
232
  self._emit_aspect(
224
233
  entity_urn=self.execution_request_input_urn,
225
234
  aspect_value=execution_input_aspect,
235
+ try_sync=True,
226
236
  )
227
237
 
228
238
  def on_completion(
@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
44
44
  from datahub.sdk._attribution import KnownAttribution, change_default_attribution
45
45
  from datahub.telemetry import stats
46
46
  from datahub.telemetry.telemetry import telemetry_instance
47
+ from datahub.upgrade.upgrade import (
48
+ is_server_default_cli_ahead,
49
+ retrieve_version_stats,
50
+ )
47
51
  from datahub.utilities._custom_package_loader import model_version_name
48
52
  from datahub.utilities.global_warning_util import (
49
53
  clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
171
175
  self.last_time_printed = int(time.time())
172
176
  self.cli_report = CliReport()
173
177
 
174
- with contextlib.ExitStack() as exit_stack, contextlib.ExitStack() as inner_exit_stack:
178
+ with (
179
+ contextlib.ExitStack() as exit_stack,
180
+ contextlib.ExitStack() as inner_exit_stack,
181
+ ):
175
182
  self.graph: Optional[DataHubGraph] = None
176
183
  with _add_init_error_context("connect to DataHub"):
177
184
  if self.config.datahub_api:
@@ -340,6 +347,44 @@ class Pipeline:
340
347
  except Exception as e:
341
348
  logger.warning("Reporting failed on start", exc_info=e)
342
349
 
350
def _warn_old_cli_version(self) -> None:
    """Warn when the server's default CLI version is ahead of the CLI in use.

    Looks up version stats via ``retrieve_version_stats`` and, when
    ``is_server_default_cli_ahead`` reports that the server default is
    newer, adds a warning to the source report. Does nothing when no graph
    is configured, when no version stats are available, or when the lookup
    fails because no event loop is available.

    Raises:
        RuntimeError: Re-raised from the lookup unless its message contains
            "no current event loop".
    """
    # No warning is ever emitted without a graph, so return before doing
    # the version lookup rather than discarding its result afterwards.
    if not self.graph:
        return

    try:
        version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
    except RuntimeError as e:
        # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
        if "no current event loop" in str(e):
            logger.debug("Skipping version check - no event loop available")
            return
        raise

    if not version_stats or not is_server_default_cli_ahead(version_stats):
        return

    default_cli = version_stats.server.current_server_default_cli_version
    server_default_version = default_cli.version if default_cli else None
    current_version = version_stats.client.current.version

    logger.debug(f"""
        client_version: {current_version}
        server_default_version: {server_default_version}
        server_default_cli_ahead: True
    """)

    self.source.get_report().warning(
        title="Server default CLI version is ahead of CLI version",
        message="Please upgrade the CLI version being used",
        context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
    )
387
+
343
388
  def _notify_reporters_on_ingestion_completion(self) -> None:
344
389
  for reporter in self.reporters:
345
390
  try:
@@ -396,6 +441,7 @@ class Pipeline:
396
441
  return False
397
442
 
398
443
  def run(self) -> None:
444
+ self._warn_old_cli_version()
399
445
  with self.exit_stack, self.inner_exit_stack:
400
446
  if self.config.flags.generate_memory_profiles:
401
447
  import memray
@@ -502,7 +548,7 @@ class Pipeline:
502
548
  self._handle_uncaught_pipeline_exception(exc)
503
549
  finally:
504
550
  clear_global_warnings()
505
-
551
+ self.sink.flush()
506
552
  self._notify_reporters_on_ingestion_completion()
507
553
 
508
554
  def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
@@ -578,11 +624,17 @@ class Pipeline:
578
624
  sink_failures = len(self.sink.get_report().failures)
579
625
  sink_warnings = len(self.sink.get_report().warnings)
580
626
  global_warnings = len(get_global_warnings())
627
+ source_aspects = self.source.get_report().get_aspects_dict()
628
+ source_aspects_by_subtype = (
629
+ self.source.get_report().get_aspects_by_subtypes_dict()
630
+ )
581
631
 
582
632
  telemetry_instance.ping(
583
633
  "ingest_stats",
584
634
  {
585
635
  "source_type": self.source_type,
636
+ "source_aspects": source_aspects,
637
+ "source_aspects_by_subtype": source_aspects_by_subtype,
586
638
  "sink_type": self.sink_type,
587
639
  "transformer_types": [
588
640
  transformer.type for transformer in self.config.transformers or []
@@ -5,6 +5,7 @@ import functools
5
5
  import logging
6
6
  import os
7
7
  import threading
8
+ import time
8
9
  import uuid
9
10
  from enum import auto
10
11
  from typing import List, Optional, Tuple, Union
@@ -70,6 +71,7 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
70
71
  class DatahubRestSinkConfig(DatahubClientConfig):
71
72
  mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
72
73
  endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
74
+ server_config_refresh_interval: Optional[int] = None
73
75
 
74
76
  # These only apply in async modes.
75
77
  max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -345,6 +347,17 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
345
347
  RecordEnvelope(item, metadata={}), NoopWriteCallback()
346
348
  )
347
349
 
350
def flush(self) -> None:
    """Block until the sink report shows no pending requests.

    Polls ``self.report.pending_requests`` with a 0.1 sleep between checks
    and logs the remaining count on every 1000th poll.
    """
    polls = 0
    while True:
        if self.report.pending_requests <= 0:
            return
        time.sleep(0.1)
        polls += 1
        if not polls % 1000:
            logger.info(
                f"Waiting for {self.report.pending_requests} records to be written"
            )
360
+
348
361
  def close(self):
349
362
  with self.report.main_thread_blocking_timer:
350
363
  self.executor.shutdown()
@@ -533,7 +533,7 @@ class ABSSource(StatefulIngestionSourceBase):
533
533
  )
534
534
  path_spec.sample_files = False
535
535
  for obj in container_client.list_blobs(
536
- prefix=f"{prefix}", results_per_page=PAGE_SIZE
536
+ name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
537
537
  ):
538
538
  abs_path = self.create_abs_path(obj.name)
539
539
  logger.debug(f"Path: {abs_path}")
@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
24
24
  if TYPE_CHECKING:
25
25
  from mypy_boto3_dynamodb import DynamoDBClient
26
26
  from mypy_boto3_glue import GlueClient
27
+ from mypy_boto3_lakeformation import LakeFormationClient
27
28
  from mypy_boto3_s3 import S3Client, S3ServiceResource
28
29
  from mypy_boto3_sagemaker import SageMakerClient
29
30
  from mypy_boto3_sts import STSClient
@@ -454,6 +455,9 @@ class AwsConnectionConfig(ConfigModel):
454
455
  def get_sagemaker_client(self) -> "SageMakerClient":
455
456
  return self.get_session().client("sagemaker", config=self._aws_config())
456
457
 
458
def get_lakeformation_client(self) -> "LakeFormationClient":
    """Return a ``lakeformation`` client created from this config's session.

    The client is built with the configuration returned by ``_aws_config()``.
    """
    session = self.get_session()
    return session.client("lakeformation", config=self._aws_config())
460
+
457
461
 
458
462
  class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
459
463
  """