acryl-datahub 1.1.1rc4-py3-none-any.whl → 1.2.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1433 -546
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17736 -17112
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/graph/client.py

@@ -22,6 +22,7 @@ from typing import (
     Union,
 )
 
+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError
@@ -159,6 +160,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             openapi_ingestion=self.config.openapi_ingestion,
             client_mode=config.client_mode,
             datahub_component=config.datahub_component,
+            server_config_refresh_interval=config.server_config_refresh_interval,
         )
         self.server_id: str = _MISSING_SERVER_ID
 
@@ -234,6 +236,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                 client_certificate_path=session_config.client_certificate_path,
                 client_mode=session_config.client_mode,
                 datahub_component=session_config.datahub_component,
+                server_config_refresh_interval=emitter._server_config_refresh_interval,
             )
         )
 
@@ -502,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)
 
         values: Optional[List] = resp.get("value", {}).get("values")
@@ -522,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -652,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
 
     @property
     def _search_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/entities?action=search"
 
     @property
     def _relationships_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/openapi/relationships/v1/"
 
     @property
     def _aspect_count_endpoint(self):
-        return f"{self.
+        return f"{self._gms_server}/aspects?action=getCount"
 
     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -806,7 +809,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "input": search_query,
             "entity": "container",
             "start": 0,
-            "count":
+            "count": 5000,
             "filter": {"or": container_filters},
         }
         results: Dict = self._post_generic(url, search_body)
@@ -901,9 +904,10 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         query: Optional[str] = None,
        container: Optional[str] = None,
         status: Optional[RemovedStatusFilter] = RemovedStatusFilter.NOT_SOFT_DELETED,
-        batch_size: int =
+        batch_size: int = 5000,
         extraFilters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
+        skip_cache: bool = False,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.
 
@@ -922,6 +926,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         Note that this requires browsePathV2 aspects (added in 0.10.4+).
         :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities.
         :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters.
+        :param skip_cache: Whether to bypass caching. Defaults to False.
 
         :return: An iterable of urns that match the filters.
         """
@@ -949,7 +954,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             $query: String!,
             $orFilters: [AndFilterInput!],
             $batchSize: Int!,
-            $scrollId: String
+            $scrollId: String,
+            $skipCache: Boolean!) {
 
           scrollAcrossEntities(input: {
             query: $query,
@@ -960,6 +966,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             searchFlags: {
               skipHighlighting: true
              skipAggregates: true
+              skipCache: $skipCache
             }
           }) {
             nextScrollId
@@ -978,6 +985,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": orFilters,
             "batchSize": batch_size,
+            "skipCache": skip_cache,
         }
 
         for entity in self._scroll_across_entities(graphql_query, variables):
@@ -993,7 +1001,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         query: Optional[str] = None,
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
-        batch_size: int =
+        batch_size: int = 5000,
         extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
         extra_or_filters: Optional[RawSearchFilter] = None,
         extra_source_fields: Optional[List[str]] = None,
@@ -1083,7 +1091,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "query": query,
             "orFilters": or_filters_final,
             "batchSize": batch_size,
-            "skipCache":
+            "skipCache": skip_cache,
             "fetchExtraFields": extra_source_fields,
         }
 
@@ -1202,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.
+        url = f"{self._gms_server}/api/graphql"
 
         body: Dict = {
             "query": query,
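
The hunks above move endpoint construction onto self._gms_server, make the 5000-entry defaults explicit, and add a skip_cache flag that is threaded into the scrollAcrossEntities searchFlags. A minimal sketch of the new flag from the SDK side, assuming a reachable DataHub instance (the server URL and filter values are placeholders):

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))  # placeholder URL

    # skip_cache=True (new in 1.2.0) sets skipCache in the GraphQL searchFlags,
    # asking the backend to bypass its search cache; batch_size now defaults to 5000.
    for urn in graph.get_urns_by_filter(
        entity_types=["dataset"],
        platform="snowflake",
        skip_cache=True,
    ):
        print(urn)
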
datahub/ingestion/graph/client.py (continued)

@@ -1427,6 +1435,83 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects
 
+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
+    def restore_indices(
+        self,
+        urn_pattern: Optional[str] = None,
+        aspect: Optional[str] = None,
+        start: Optional[int] = None,
+        batch_size: Optional[int] = None,
+        file: Optional[str] = None,
+    ) -> None:
+        """Restore the indices for a given urn or urn-like pattern.
+
+        Args:
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
+            aspect: Optional aspect string to restore indices for a specific aspect.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.
+
+        Returns:
+            A string containing the result of the restore indices operation. This format is subject to change.
+        """
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
+        else:
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+            if aspect is not None:
+                payload_obj["aspect"] = aspect
+            if start is not None:
+                payload_obj["start"] = start
+            if batch_size is not None:
+                payload_obj["batchSize"] = batch_size
+            self._restore_index_call(payload_obj)
+
     @functools.lru_cache
     def _make_schema_resolver(
         self,
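
get_kafka_consumer_offsets() and restore_indices() are new public helpers on DataHubGraph. A hedged usage sketch, reusing the graph client from the earlier example (the URN pattern and batch size are illustrative):

    # Rebuild indices for all URNs matching a SQL-LIKE pattern;
    # "%" is the wildcard, per the docstring above.
    graph.restore_indices(
        urn_pattern="urn:li:dataset:%",
        batch_size=100,
    )

    # Inspect offsets for the MCP/MCL Kafka consumers.
    offsets = graph.get_kafka_consumer_offsets()
    for consumer, payload in offsets.items():
        print(consumer, payload)
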
datahub/ingestion/graph/client.py (continued)

@@ -1491,7 +1576,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         env: str = DEFAULT_ENV,
         default_db: Optional[str] = None,
         default_schema: Optional[str] = None,
-
+        override_dialect: Optional[str] = None,
     ) -> "SqlParsingResult":
         from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
 
@@ -1505,7 +1590,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             schema_resolver=schema_resolver,
             default_db=default_db,
             default_schema=default_schema,
-
+            override_dialect=override_dialect,
         )
 
     def create_tag(self, tag_name: str) -> str:
@@ -1732,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()
 
@@ -1789,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }
 
-        url = f"{self.
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"
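
The first two hunks above add an override_dialect parameter to what, judging by the signature, is DataHubGraph.parse_sql_lineage (the method name itself is not visible in the hunk). A sketch under that assumption:

    # override_dialect forces a specific sqlglot dialect instead of inferring
    # one from the platform; "snowflake" is an illustrative value.
    result = graph.parse_sql_lineage(
        sql="SELECT id, amount FROM analytics.public.orders",
        platform="snowflake",
        default_db="analytics",
        override_dialect="snowflake",
    )
    print(result.in_tables)
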
datahub/ingestion/graph/config.py

@@ -29,6 +29,7 @@ class DatahubClientConfig(ConfigModel):
     openapi_ingestion: Optional[bool] = None
     client_mode: Optional[ClientMode] = None
     datahub_component: Optional[str] = None
+    server_config_refresh_interval: Optional[int] = None
 
     class Config:
         extra = "ignore"
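
server_config_refresh_interval is the client-side knob that the earlier client.py hunks forward into the REST emitter (and that DatahubRestSinkConfig gains below). A sketch; the diff does not state the unit, so seconds is an assumption:

    from datahub.ingestion.graph.client import DataHubGraph
    from datahub.ingestion.graph.config import DatahubClientConfig

    config = DatahubClientConfig(
        server="http://localhost:8080",      # placeholder
        server_config_refresh_interval=60,   # assumed to be seconds
    )
    graph = DataHubGraph(config)
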
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py

@@ -13,6 +13,7 @@ from datahub.configuration.common import (
 from datahub.emitter.aspect import JSON_CONTENT_TYPE
 from datahub.emitter.mce_builder import datahub_guid, make_data_platform_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import EmitMode
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
 from datahub.ingestion.api.pipeline_run_listener import PipelineRunListener
 from datahub.ingestion.api.sink import NoopWriteCallback, Sink
@@ -111,6 +112,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
     def __init__(self, sink: Sink, report_recipe: bool, ctx: PipelineContext) -> None:
         assert ctx.pipeline_config is not None
 
+        self.ctx = ctx
         self.sink: Sink = sink
         self.report_recipe = report_recipe
         ingestion_source_key = self.generate_unique_key(ctx.pipeline_config)
@@ -191,18 +193,25 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         )
         return json.dumps(converted_recipe)
 
-    def _emit_aspect(
-        self
-
-
-
-
-        ),
-        metadata={},
-        ),
-        NoopWriteCallback(),
+    def _emit_aspect(
+        self, entity_urn: Urn, aspect_value: _Aspect, try_sync: bool = False
+    ) -> None:
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=str(entity_urn),
+            aspect=aspect_value,
         )
 
+        if try_sync and self.ctx.graph:
+            self.ctx.graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
+        else:
+            self.sink.write_record_async(
+                RecordEnvelope(
+                    record=mcp,
+                    metadata={},
+                ),
+                NoopWriteCallback(),
+            )
+
     def on_start(self, ctx: PipelineContext) -> None:
         assert ctx.pipeline_config is not None
         # Construct the dataHubExecutionRequestInput aspect
@@ -223,6 +232,7 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
         self._emit_aspect(
             entity_urn=self.execution_request_input_urn,
             aspect_value=execution_input_aspect,
+            try_sync=True,
         )
 
     def on_completion(
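
_emit_aspect can now write through the graph synchronously (EmitMode.SYNC_PRIMARY) instead of queueing on the sink, which on_start uses for the execution-request input aspect. The same call pattern outside the provider, as a sketch against the graph client from the earlier examples:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.emitter.rest_emitter import EmitMode
    from datahub.metadata.schema_classes import StatusClass

    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:corpuser:jdoe",   # placeholder URN
        aspect=StatusClass(removed=False),
    )
    # SYNC_PRIMARY presumably blocks until the primary store accepts the write.
    graph.emit_mcp(mcp, emit_mode=EmitMode.SYNC_PRIMARY)
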
datahub/ingestion/run/pipeline.py

@@ -44,6 +44,10 @@ from datahub.ingestion.transformer.transform_registry import transform_registry
 from datahub.sdk._attribution import KnownAttribution, change_default_attribution
 from datahub.telemetry import stats
 from datahub.telemetry.telemetry import telemetry_instance
+from datahub.upgrade.upgrade import (
+    is_server_default_cli_ahead,
+    retrieve_version_stats,
+)
 from datahub.utilities._custom_package_loader import model_version_name
 from datahub.utilities.global_warning_util import (
     clear_global_warnings,
@@ -171,7 +175,10 @@ class Pipeline:
         self.last_time_printed = int(time.time())
         self.cli_report = CliReport()
 
-        with
+        with (
+            contextlib.ExitStack() as exit_stack,
+            contextlib.ExitStack() as inner_exit_stack,
+        ):
             self.graph: Optional[DataHubGraph] = None
             with _add_init_error_context("connect to DataHub"):
                 if self.config.datahub_api:
@@ -340,6 +347,44 @@ class Pipeline:
         except Exception as e:
             logger.warning("Reporting failed on start", exc_info=e)
 
+    def _warn_old_cli_version(self) -> None:
+        """
+        Check if the server default CLI version is ahead of the CLI version being used.
+        If so, add a warning to the report.
+        """
+
+        try:
+            version_stats = retrieve_version_stats(timeout=2.0, graph=self.graph)
+        except RuntimeError as e:
+            # Handle case where there's no event loop available (e.g., in ThreadPoolExecutor)
+            if "no current event loop" in str(e):
+                logger.debug("Skipping version check - no event loop available")
+                return
+            raise
+
+        if not version_stats or not self.graph:
+            return
+
+        if is_server_default_cli_ahead(version_stats):
+            server_default_version = (
+                version_stats.server.current_server_default_cli_version.version
+                if version_stats.server.current_server_default_cli_version
+                else None
+            )
+            current_version = version_stats.client.current.version
+
+            logger.debug(f"""
+                client_version: {current_version}
+                server_default_version: {server_default_version}
+                server_default_cli_ahead: True
+            """)
+
+            self.source.get_report().warning(
+                title="Server default CLI version is ahead of CLI version",
+                message="Please upgrade the CLI version being used",
+                context=f"Server Default CLI version: {server_default_version}, Used CLI version: {current_version}",
+            )
+
     def _notify_reporters_on_ingestion_completion(self) -> None:
         for reporter in self.reporters:
             try:
@@ -396,6 +441,7 @@ class Pipeline:
             return False
 
     def run(self) -> None:
+        self._warn_old_cli_version()
         with self.exit_stack, self.inner_exit_stack:
             if self.config.flags.generate_memory_profiles:
                 import memray
@@ -502,7 +548,7 @@ class Pipeline:
             self._handle_uncaught_pipeline_exception(exc)
         finally:
             clear_global_warnings()
-
+            self.sink.flush()
             self._notify_reporters_on_ingestion_completion()
@@ -578,11 +624,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )
 
         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []
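
Pipeline.run() now calls _warn_old_cli_version() up front, so any recipe executed through the standard entry point surfaces the version warning in the source report. A sketch of that entry point (source/sink values are placeholders):

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {"type": "demo-data", "config": {}},
            "sink": {"type": "console"},
        }
    )
    pipeline.run()  # warns if the server's default CLI version is ahead of this CLI
    pipeline.pretty_print_summary()
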
datahub/ingestion/sink/datahub_rest.py

@@ -5,6 +5,7 @@ import functools
 import logging
 import os
 import threading
+import time
 import uuid
 from enum import auto
 from typing import List, Optional, Tuple, Union
@@ -70,6 +71,7 @@ _DEFAULT_REST_SINK_MODE = pydantic.parse_obj_as(
 class DatahubRestSinkConfig(DatahubClientConfig):
     mode: RestSinkMode = _DEFAULT_REST_SINK_MODE
     endpoint: RestSinkEndpoint = DEFAULT_REST_EMITTER_ENDPOINT
+    server_config_refresh_interval: Optional[int] = None
 
     # These only apply in async modes.
     max_threads: pydantic.PositiveInt = _DEFAULT_REST_SINK_MAX_THREADS
@@ -345,6 +347,17 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
                 RecordEnvelope(item, metadata={}), NoopWriteCallback()
             )
 
+    def flush(self) -> None:
+        """Wait for all pending records to be written."""
+        i = 0
+        while self.report.pending_requests > 0:
+            time.sleep(0.1)
+            i += 1
+            if i % 1000 == 0:
+                logger.info(
+                    f"Waiting for {self.report.pending_requests} records to be written"
+                )
+
     def close(self):
         with self.report.main_thread_blocking_timer:
             self.executor.shutdown()
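
flush() busy-waits on report.pending_requests; pipeline.run() now calls it in its finally block, so completion reporters observe fully written records. A self-contained sketch of the same poll-until-drained pattern (PendingCounter is a toy stand-in for the sink report's gauge):

    import threading
    import time

    class PendingCounter:
        """Toy stand-in for DataHubRestSinkReport.pending_requests."""

        def __init__(self) -> None:
            self._n = 0
            self._lock = threading.Lock()

        def add(self, delta: int) -> None:
            with self._lock:
                self._n += delta

        @property
        def value(self) -> int:
            with self._lock:
                return self._n

    def flush(counter: PendingCounter, poll: float = 0.1, log_every: int = 1000) -> None:
        # Same shape as DatahubRestSink.flush: poll until the async writers
        # drain, logging roughly every poll * log_every seconds.
        i = 0
        while counter.value > 0:
            time.sleep(poll)
            i += 1
            if i % log_every == 0:
                print(f"Waiting for {counter.value} records to be written")
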
datahub/ingestion/source/abs/source.py

@@ -533,7 +533,7 @@ class ABSSource(StatefulIngestionSourceBase):
         )
         path_spec.sample_files = False
         for obj in container_client.list_blobs(
-
+            name_starts_with=f"{prefix}", results_per_page=PAGE_SIZE
         ):
             abs_path = self.create_abs_path(obj.name)
             logger.debug(f"Path: {abs_path}")
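
This change passes the prefix and page size directly to azure-storage-blob's list_blobs, which supports server-side prefix filtering and paging. A standalone equivalent (connection string and names are placeholders):

    from azure.storage.blob import BlobServiceClient

    PAGE_SIZE = 1000  # stand-in for the source's PAGE_SIZE constant

    service = BlobServiceClient.from_connection_string("<connection-string>")
    container_client = service.get_container_client("my-container")

    # name_starts_with filters server-side; results_per_page controls paging.
    for blob in container_client.list_blobs(
        name_starts_with="raw/2024/", results_per_page=PAGE_SIZE
    ):
        print(blob.name)
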
datahub/ingestion/source/aws/aws_common.py

@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from mypy_boto3_dynamodb import DynamoDBClient
     from mypy_boto3_glue import GlueClient
+    from mypy_boto3_lakeformation import LakeFormationClient
     from mypy_boto3_s3 import S3Client, S3ServiceResource
     from mypy_boto3_sagemaker import SageMakerClient
     from mypy_boto3_sts import STSClient
@@ -454,6 +455,9 @@
     def get_sagemaker_client(self) -> "SageMakerClient":
         return self.get_session().client("sagemaker", config=self._aws_config())
 
+    def get_lakeformation_client(self) -> "LakeFormationClient":
+        return self.get_session().client("lakeformation", config=self._aws_config())
+
 
 class AwsSourceConfig(EnvConfigMixin, AwsConnectionConfig):
     """