acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_common.py
CHANGED

@@ -471,7 +471,10 @@ def get_view_file_path(
     logger.debug("Entered")

     for field in lkml_fields:
-        if LookerUtil.extract_view_name_from_lookml_model_explore_field(field) == view_name:
+        if (
+            LookerUtil.extract_view_name_from_lookml_model_explore_field(field)
+            == view_name
+        ):
            # This path is relative to git clone directory
            logger.debug(f"Found view({view_name}) file-path {field.source_file}")
            return field.source_file

@@ -1103,7 +1106,7 @@ class LookerExplore:
                    [column_ref] if column_ref is not None else []
                )

-            return cls(
+            looker_explore = cls(
                name=explore_name,
                model_name=model,
                project_name=explore.project_name,

@@ -1121,6 +1124,8 @@ class LookerExplore:
                source_file=explore.source_file,
                tags=list(explore.tags) if explore.tags is not None else [],
            )
+            logger.debug(f"Created LookerExplore from API: {looker_explore}")
+            return looker_explore
        except SDKError as e:
            if "<title>Looker Not Found (404)</title>" in str(e):
                logger.info(

@@ -1161,6 +1166,9 @@ class LookerExplore:
        dataset_name = config.explore_naming_pattern.replace_variables(
            self.get_mapping(config)
        )
+        logger.debug(
+            f"Generated dataset_name={dataset_name} for explore with model_name={self.model_name}, name={self.name}"
+        )

        return builder.make_dataset_urn_with_platform_instance(
            platform=config.platform_name,

@@ -1362,6 +1370,7 @@ class LookerExploreRegistry:

    @lru_cache(maxsize=200)
    def get_explore(self, model: str, explore: str) -> Optional[LookerExplore]:
+        logger.debug(f"Retrieving explore: model={model}, explore={explore}")
        looker_explore = LookerExplore.from_api(
            model,
            explore,

@@ -1369,6 +1378,12 @@ class LookerExploreRegistry:
            self.report,
            self.source_config,
        )
+        if looker_explore is not None:
+            logger.debug(
+                f"Found explore with model_name={looker_explore.model_name}, name={looker_explore.name}"
+            )
+        else:
+            logger.debug(f"No explore found for model={model}, explore={explore}")
        return looker_explore

    def compute_stats(self) -> Dict:
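As context for the urn construction in the hunk above, here is a minimal sketch (not part of the diff; the explore and model names are made up) of what builder.make_dataset_urn_with_platform_instance produces for a Looker explore, assuming the standard DataHub URN layout:

from datahub.emitter import mce_builder as builder

# Illustrative values only; in the source these come from explore_naming_pattern and the config.
dataset_name = "sales_model.explore.orders"
urn = builder.make_dataset_urn_with_platform_instance(
    platform="looker",
    name=dataset_name,
    platform_instance=None,
    env="PROD",
)
print(urn)  # urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.orders,PROD)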
datahub/ingestion/source/looker/looker_lib_wrapper.py
CHANGED

@@ -113,7 +113,7 @@ class LookerAPI:
            )
        except SDKError as e:
            raise ConfigurationError(
-                …
+                "Failed to connect/authenticate with looker - check your configuration"
            ) from e

        self.client_stats = LookerAPIStats()
datahub/ingestion/source/looker/looker_source.py
CHANGED

@@ -279,6 +279,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
            return []
        result = []

+        if query is not None:
+            logger.debug(
+                f"Processing query: model={query.model}, view={query.view}, input_fields_count={len(query.fields) if query.fields else 0}"
+            )
+
        # query.dynamic_fields can contain:
        # - looker table calculations: https://docs.looker.com/exploring-data/using-table-calculations
        # - looker custom measures: https://docs.looker.com/de/exploring-data/adding-fields/custom-measure

@@ -399,9 +404,12 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
            # Get the explore from the view directly
            explores = [element.query.view] if element.query.view is not None else []
            logger.debug(
-                f"…"
+                f"Dashboard element {element.title} (ID: {element.id}): Upstream explores added via query={explores} with model={element.query.model}, explore={element.query.view}"
            )
            for exp in explores:
+                logger.debug(
+                    f"Adding reachable explore: model={element.query.model}, explore={exp}, element_id={element.id}, title={element.title}"
+                )
                self.add_reachable_explore(
                    model=element.query.model,
                    explore=exp,

@@ -477,12 +485,10 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

        # Failing the above two approaches, pick out details from result_maker
        elif element.result_maker is not None:
-            model: str = ""
            input_fields = []

            explores = []
            if element.result_maker.query is not None:
-                model = element.result_maker.query.model
                if element.result_maker.query.view is not None:
                    explores.append(element.result_maker.query.view)
                    input_fields = self._get_input_fields_from_query(

@@ -502,9 +508,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

            # In addition to the query, filters can point to fields as well
            assert element.result_maker.filterables is not None
+
+            # Different dashboard elements my reference explores from different models
+            # so we need to create a mapping of explore names to their models to maintain correct associations
+            explore_to_model_map = {}
+
            for filterable in element.result_maker.filterables:
                if filterable.view is not None and filterable.model is not None:
-                    model = filterable.model
+                    # Store the model for this view/explore in our mapping
+                    explore_to_model_map[filterable.view] = filterable.model
                    explores.append(filterable.view)
                    self.add_reachable_explore(
                        model=filterable.model,

@@ -527,6 +539,18 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

            explores = sorted(list(set(explores)))  # dedup the list of views

+            logger.debug(
+                f"Dashboard element {element.id} and their explores with the corresponding model: {explore_to_model_map}"
+            )
+
+            # If we have a query, use its model as the default for any explores that don't have a model in our mapping
+            default_model = ""
+            if (
+                element.result_maker.query is not None
+                and element.result_maker.query.model is not None
+            ):
+                default_model = element.result_maker.query.model
+
            return LookerDashboardElement(
                id=element.id,
                title=element.title if element.title is not None else "",

@@ -540,7 +564,11 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
                    else ""
                ),
                upstream_explores=[
-                    LookerExplore(model_name=model, name=exp) for exp in explores
+                    LookerExplore(
+                        model_name=explore_to_model_map.get(exp, default_model),
+                        name=exp,
+                    )
+                    for exp in explores
                ],
                input_fields=input_fields,
                owner=None,

@@ -1270,6 +1298,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
        chart_urn = self._make_chart_urn(
            element_id=dashboard_element.get_urn_element_id()
        )
+
        input_fields_aspect = InputFieldsClass(
            fields=self._input_fields_from_dashboard_element(dashboard_element)
        )
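The explore_to_model_map change above boils down to a lookup with a fallback; a self-contained sketch (names here are illustrative, not from the diff):

from typing import Dict

explore_to_model_map: Dict[str, str] = {"orders": "ecommerce"}  # filled from result_maker.filterables
default_model = "analytics"  # taken from result_maker.query.model when present

for exp in ["orders", "sessions"]:
    # Each upstream LookerExplore now gets the model recorded for it, or the query's model as a fallback.
    print(exp, "->", explore_to_model_map.get(exp, default_model))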
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -497,7 +497,13 @@ class LookMLSource(StatefulIngestionSourceBase):
                    f"Failed to find a project name for model {model_name}"
                )
            return model.project_name
-        except SDKError:
+        except SDKError as e:
+            self.reporter.failure(
+                title="Failed to find a project name for model",
+                message="Consider configuring a static project name in your config file",
+                context=str(dict(model_name=model_name)),
+                exc=e,
+            )
            raise ValueError(
                f"Could not locate a project name for model {model_name}. Consider configuring a static project name "
                f"in your config file"
datahub/ingestion/source/metadata/lineage.py
CHANGED

@@ -36,6 +36,7 @@ from datahub.ingestion.api.source_helpers import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,

@@ -210,7 +211,7 @@ def _get_lineage_mcp(

    # extract the old lineage and save it for the new mcp
    if preserve_upstream:
-        client = get_default_graph()
+        client = get_default_graph(ClientMode.INGESTION)

        old_upstream_lineage = get_aspects_for_entity(
            client._session,
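A hedged usage sketch of the ClientMode-aware graph client (only ClientMode.INGESTION is visible in this diff; the aspect read below is a generic DataHubGraph call, not code from lineage.py):

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode
from datahub.metadata.schema_classes import UpstreamLineageClass

graph = get_default_graph(ClientMode.INGESTION)
upstreams = graph.get_aspect(
    "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",  # placeholder urn
    UpstreamLineageClass,
)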
datahub/ingestion/source/mlflow.py
CHANGED

@@ -7,6 +7,7 @@ from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union
 from mlflow import MlflowClient
 from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
+from mlflow.exceptions import MlflowException
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field

@@ -589,8 +590,8 @@ class MLflowSource(StatefulIngestionSourceBase):
        )
        return runs

-    @staticmethod
    def _traverse_mlflow_search_func(
+        self,
        search_func: Callable[..., PagedList[T]],
        **kwargs: Any,
    ) -> Iterable[T]:

@@ -598,12 +599,24 @@ class MLflowSource(StatefulIngestionSourceBase):
        Utility to traverse an MLflow search_* functions which return PagedList.
        """
        next_page_token = None
-        while True:
-            paged_list = search_func(page_token=next_page_token, **kwargs)
-            yield from paged_list.to_list()
-            next_page_token = paged_list.token
-            if not next_page_token:
+        try:
+            while True:
+                paged_list = search_func(page_token=next_page_token, **kwargs)
+                yield from paged_list.to_list()
+                next_page_token = paged_list.token
+                if not next_page_token:
+                    return
+        except MlflowException as e:
+            if e.error_code == "ENDPOINT_NOT_FOUND":
+                self.report.warning(
+                    title="MLflow API Endpoint Not Found for Experiments.",
+                    message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
+                    context=None,
+                    exc=e,
+                )
                return
+            else:
+                raise  # Only re-raise other exceptions

    def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
        return (
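The reworked _traverse_mlflow_search_func is essentially a page-token walk over MLflow's PagedList results. A standalone, hedged sketch of the same pattern (the client setup and the particular search function chosen here are illustrative, not taken from the source):

from typing import Any, Callable, Iterable, TypeVar

from mlflow import MlflowClient
from mlflow.store.entities import PagedList

T = TypeVar("T")

def traverse(search_func: Callable[..., PagedList[T]], **kwargs: Any) -> Iterable[T]:
    # Follow the continuation token until the server stops returning one.
    next_page_token = None
    while True:
        paged_list = search_func(page_token=next_page_token, **kwargs)
        yield from paged_list.to_list()
        next_page_token = paged_list.token
        if not next_page_token:
            return

client = MlflowClient()
for registered_model in traverse(client.search_registered_models, max_results=100):
    print(registered_model.name)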
datahub/ingestion/source/mode.py
CHANGED

@@ -6,7 +6,7 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
 import pydantic

@@ -203,6 +203,10 @@ class HTTPError429(HTTPError):
    pass


+class HTTPError504(HTTPError):
+    pass
+
+
 ModeRequestError = (HTTPError, JSONDecodeError)


@@ -217,6 +221,9 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
    num_query_template_render: int = 0
    num_query_template_render_failures: int = 0
    num_query_template_render_success: int = 0
+    num_requests_exceeding_rate_limit: int = 0
+    num_requests_retried_on_timeout: int = 0
+    num_spaces_retrieved: int = 0

    def report_dropped_space(self, ent_name: str) -> None:
        self.filtered_spaces.append(ent_name)

@@ -456,9 +463,23 @@ class ModeSource(StatefulIngestionSourceBase):
        # Datasets
        datasets = []
        for imported_dataset_name in report_info.get("imported_datasets", {}):
-            mode_dataset = self._get_request_json(
-                f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
-            )
+            try:
+                mode_dataset = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{imported_dataset_name.get('token')}"
+                )
+            except HTTPError as http_error:
+                status_code = http_error.response.status_code
+                if status_code == 404:
+                    self.report.report_warning(
+                        title="Report Not Found",
+                        message="Referenced report for reusable dataset was not found.",
+                        context=f"Report: {report_info.get('id')}, "
+                        f"Imported Dataset Report: {imported_dataset_name.get('token')}",
+                    )
+                    continue
+                else:
+                    raise http_error
+

            dataset_urn = builder.make_dataset_urn_with_platform_instance(
                self.platform,
                str(mode_dataset.get("id")),

@@ -562,29 +583,34 @@ class ModeSource(StatefulIngestionSourceBase):
        space_info = {}
        try:
            logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            …
+            for spaces_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
+            ):
+                logger.debug(
+                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                )
+                self.report.num_spaces_retrieved += len(spaces_page)
+                for s in spaces_page:
+                    logger.debug(f"Space: {s.get('name')}")
+                    space_name = s.get("name", "")
+                    # Using both restricted and default_access_level because
+                    # there is a current bug with restricted returning False everytime
+                    # which has been reported to Mode team
+                    if self.config.exclude_restricted and (
+                        s.get("restricted")
+                        or s.get("default_access_level") == "restricted"
+                    ):
+                        logging.debug(
+                            f"Skipping space {space_name} due to exclude restricted"
+                        )
+                        continue
+                    if not self.config.space_pattern.allowed(space_name):
+                        self.report.report_dropped_space(space_name)
+                        logging.debug(
+                            f"Skipping space {space_name} due to space pattern"
+                        )
+                        continue
+                    space_info[s.get("token", "")] = s.get("name", "")
        except ModeRequestError as e:
            self.report.report_failure(
                title="Failed to Retrieve Spaces",

@@ -1475,13 +1501,28 @@ class ModeSource(StatefulIngestionSourceBase):
        )
        return charts

+    def _get_paged_request_json(
+        self, url: str, key: str, per_page: int
+    ) -> Iterator[List[Dict]]:
+        page: int = 1
+        while True:
+            page_url = f"{url}&per_page={per_page}&page={page}"
+            response = self._get_request_json(page_url)
+            data: List[Dict] = response.get("_embedded", {}).get(key, [])
+            if not data:
+                break
+            yield data
+            page += 1
+
    def _get_request_json(self, url: str) -> Dict:
        r = tenacity.Retrying(
            wait=wait_exponential(
                multiplier=self.config.api_options.retry_backoff_multiplier,
                max=self.config.api_options.max_retry_interval,
            ),
-            retry=retry_if_exception_type(…),
+            retry=retry_if_exception_type(
+                (HTTPError429, HTTPError504, ConnectionError)
+            ),
            stop=stop_after_attempt(self.config.api_options.max_attempts),
        )

@@ -1502,11 +1543,16 @@ class ModeSource(StatefulIngestionSourceBase):
        except HTTPError as http_error:
            error_response = http_error.response
            if error_response.status_code == 429:
+                self.report.num_requests_exceeding_rate_limit += 1
                # respect Retry-After
                sleep_time = error_response.headers.get("retry-after")
                if sleep_time is not None:
                    time.sleep(float(sleep_time))
                raise HTTPError429 from None
+            elif error_response.status_code == 504:
+                self.report.num_requests_retried_on_timeout += 1
+                time.sleep(0.1)
+                raise HTTPError504 from None

            logger.debug(
                f"Error response ({error_response.status_code}): {error_response.text}"
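Taken together, the mode.py changes combine tenacity-driven retries (now also on 504 and ConnectionError), Retry-After handling, and page-by-page fetching. A generic, hedged sketch of that combination; the endpoint shape, helper names, and status handling here are illustrative rather than Mode's actual API:

import time
from typing import Dict, Iterator, List

import requests
import tenacity
from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

class RetryableStatus(Exception):
    # Placeholder for the per-status exceptions (HTTPError429/HTTPError504) used in the source.
    pass

def get_json(url: str) -> Dict:
    def fetch() -> Dict:
        response = requests.get(url, timeout=40)
        if response.status_code in (429, 504):
            # Honour Retry-After (if present) before letting tenacity schedule the retry.
            time.sleep(float(response.headers.get("retry-after", 1)))
            raise RetryableStatus(f"retryable status {response.status_code}")
        response.raise_for_status()
        return response.json()

    retryer = tenacity.Retrying(
        wait=wait_exponential(multiplier=2, max=60),
        retry=retry_if_exception_type((RetryableStatus, ConnectionError)),
        stop=stop_after_attempt(5),
    )
    return retryer(fetch)

def get_paged(url: str, key: str, per_page: int = 30) -> Iterator[List[Dict]]:
    # Mirrors the shape of _get_paged_request_json: stop on the first empty page.
    page = 1
    while True:
        payload = get_json(f"{url}&per_page={per_page}&page={page}")
        data: List[Dict] = payload.get("_embedded", {}).get(key, [])
        if not data:
            break
        yield data
        page += 1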
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED

@@ -5,27 +5,35 @@ from typing import Any, Dict, Iterable, List, Optional, Type, Union

 import pandas as pd
 from neo4j import GraphDatabase
-from pydantic …
+from pydantic import Field

 from datahub.configuration.source_common import (
     EnvConfigMixin,
+    PlatformInstanceConfigMixin,
+)
+from datahub.emitter.mce_builder import (
+    make_data_platform_urn,
+    make_dataset_urn_with_platform_instance,
 )
-from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
 )
 from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
+    SourceCapability,
 )
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,

@@ -64,12 +72,16 @@ _type_mapping: Dict[Union[Type, str], Type] = {
 }


-class Neo4jConfig(…):
+class Neo4jConfig(
+    StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
+):
    username: str = Field(description="Neo4j Username")
    password: str = Field(description="Neo4j Password")
    uri: str = Field(description="The URI for the Neo4j server")
    env: str = Field(description="Neo4j env")

+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+

 @dataclass
 class Neo4jSourceReport(StatefulIngestionReport):

@@ -79,21 +91,27 @@ class Neo4jSourceReport(StatefulIngestionReport):

 @platform_name("Neo4j", id="neo4j")
 @config_class(Neo4jConfig)
+@capability(
+    SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
+)
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
    NODE = "node"
    RELATIONSHIP = "relationship"
-    …
+    config: Neo4jConfig
+    report: Neo4jSourceReport

-    def __init__(self, …):
+    def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
+        super().__init__(config, ctx)
        self.ctx = ctx
        self.config = config
+        self.platform = "neo4j"
        self.report: Neo4jSourceReport = Neo4jSourceReport()

    @classmethod
-    def create(cls, config_dict, ctx):
+    def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
        config = Neo4jConfig.parse_obj(config_dict)
-        return cls(…)
+        return cls(config, ctx)

    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)

@@ -123,34 +141,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
        dataset: str,
        description: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
-    ) -> …:
+    ) -> Iterable[MetadataWorkUnit]:
        dataset_properties = DatasetPropertiesClass(
            description=description,
            customProperties=custom_properties,
        )
-        …
-            entityUrn=…
-            platform=self.…
+        yield MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn_with_platform_instance(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
            ),
            aspect=dataset_properties,
-        )
+        ).as_workunit()

    def generate_neo4j_object(
        self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
        try:
            fields = [
                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
                for d in columns
                for key, value in d.items()
            ]
-            …
-                entityUrn=…
-                platform=self.…
+            return MetadataChangeProposalWrapper(
+                entityUrn=make_dataset_urn_with_platform_instance(
+                    platform=self.platform,
+                    name=dataset,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
                ),
                aspect=SchemaMetadataClass(
                    schemaName=dataset,
-                    platform=make_data_platform_urn(self.…),
+                    platform=make_data_platform_urn(self.platform),
                    version=0,
                    hash="",
                    platformSchema=OtherSchemaClass(rawSchema=""),

@@ -161,13 +185,16 @@ class Neo4jSource(StatefulIngestionSourceBase):
                    fields=fields,
                ),
            )
-            self.report.obj_created += 1
        except Exception as e:
            log.error(e)
-            self.report.…
-            …
+            self.report.report_failure(
+                message="Failed to process dataset",
+                context=dataset,
+                exc=e,
+            )
+            return None

-    def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
+    def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
        driver = GraphDatabase.driver(
            self.config.uri, auth=(self.config.username, self.config.password)
        )

@@ -201,13 +228,14 @@ class Neo4jSource(StatefulIngestionSourceBase):

            union_cols = ["key", "obj_type", "property_data_types", "description"]
            df = pd.concat([node_df[union_cols], rel_df[union_cols]])
+            return df
        except Exception as e:
            self.report.failure(
                message="Failed to get neo4j metadata",
                exc=e,
            )

-        return
+        return None

    def process_nodes(self, data: list) -> pd.DataFrame:
        nodes = [record for record in data if record["value"]["type"] == self.NODE]

@@ -306,46 +334,48 @@ class Neo4jSource(StatefulIngestionSourceBase):
        df = self.get_neo4j_metadata(
            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
        )
+        if df is None:
+            log.warning("No metadata retrieved from Neo4j")
+            return
+
        for _, row in df.iterrows():
            try:
-                …
-                    columns=row["property_data_types"],
-                    dataset=row["key"],
-                ),
-                is_primary_source=True,
+                neo4j_obj = self.generate_neo4j_object(
+                    columns=row["property_data_types"],
+                    dataset=row["key"],
                )
-                …
-                    aspect=SubTypesClass(
-                        typeNames=[
-                            DatasetSubTypes.NEO4J_NODE
-                            if row["obj_type"] == self.NODE
-                            else DatasetSubTypes.NEO4J_RELATIONSHIP
-                        ]
-                    ),
+                if neo4j_obj:
+                    yield from auto_workunit([neo4j_obj])
+
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=make_dataset_urn_with_platform_instance(
+                        platform=self.platform,
+                        name=row["key"],
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
                    ),
-                …
-                        custom_properties=None,
-                        description=row["description"],
+                    aspect=SubTypesClass(
+                        typeNames=[
+                            DatasetSubTypes.NEO4J_NODE
+                            if row["obj_type"] == self.NODE
+                            else DatasetSubTypes.NEO4J_RELATIONSHIP
+                        ]
+                    ),
                ),
+                ).as_workunit()
+
+                yield from self.add_properties(
+                    dataset=row["key"],
+                    custom_properties=None,
+                    description=row["description"],
                )

            except Exception as e:
-                …
+                log.error(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_failure(
+                    message="Error processing Neo4j metadata",
+                    context=row["key"],
+                    exc=e,
+                )

-    def get_report(self):
+    def get_report(self) -> "Neo4jSourceReport":
        return self.report