acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/METADATA +2378 -2380
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/RECORD +161 -161
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/delete_cli.py +16 -2
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/config.py +4 -0
- datahub/ingestion/source/fivetran/fivetran.py +15 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +5 -3
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/query.py +77 -47
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3rc1.dist-info}/top_level.txt +0 -0
@@ -38,16 +38,16 @@ def merge_parent_and_child_fields(
     # Create a map field-name vs field
     child_field_map: dict = {}
     for field in child_fields:
-        assert (
-            NAME in field
-        ), "A lookml view must have a name field"  # name is required field of lookml field array
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array

        child_field_map[field[NAME]] = field

     for field in parent_fields:
-        assert (
-            NAME in field
-        ), "A lookml view must have a name field"  # name is required field of lookml field array
+        assert NAME in field, (
+            "A lookml view must have a name field"
+        )  # name is required field of lookml field array

        if field[NAME] in child_field_map:
            # Fields defined in the child view take higher precedence.
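Most of the hunks in this diff, starting with the one above, are the same mechanical restyle of multi-line assert statements: the condition moves back onto the `assert` line and only the message is wrapped in parentheses. A minimal, self-contained sketch of the before/after shapes (the `field` dictionary here is illustrative, not data from this package):

```python
# Illustrative stand-in for the lookml field check shown above.
NAME = "name"
field = {"name": "dim_user", "type": "string"}

# Old layout: the condition is parenthesized and the message trails the closing paren.
assert (
    NAME in field
), "A lookml view must have a name field"

# New layout: the condition stays on the assert line; the message is parenthesized.
assert NAME in field, (
    "A lookml view must have a name field"
)
```

Both layouts raise the same `AssertionError` with the same message when the key is missing; only the formatting changes.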
@@ -482,14 +482,14 @@ class LookMLSource(StatefulIngestionSourceBase):
         if self.source_config.project_name is not None:
             return self.source_config.project_name

-        assert (
-            self.looker_client is not None
-        ), "Failed to find a configured Looker API client"
+        assert self.looker_client is not None, (
+            "Failed to find a configured Looker API client"
+        )
         try:
             model = self.looker_client.lookml_model(model_name, fields="project_name")
-            assert (
-                model.project_name is not None
-            ), f"Failed to find a project name for model {model_name}"
+            assert model.project_name is not None, (
+                f"Failed to find a project name for model {model_name}"
+            )
             return model.project_name
         except SDKError:
             raise ValueError(
@@ -541,9 +541,9 @@ class LookMLSource(StatefulIngestionSourceBase):
             self.reporter.git_clone_latency = datetime.now() - start_time
             self.source_config.base_folder = checkout_dir.resolve()

-            self.base_projects_folder[
-                BASE_PROJECT_NAME
-            ] = self.source_config.base_folder
+            self.base_projects_folder[BASE_PROJECT_NAME] = (
+                self.source_config.base_folder
+            )

         visited_projects: Set[str] = set()

@@ -641,9 +641,9 @@ class LookMLSource(StatefulIngestionSourceBase):
                     repo_url=remote_project.url,
                 )

-                self.base_projects_folder[
-                    remote_project.name
-                ] = p_checkout_dir.resolve()
+                self.base_projects_folder[remote_project.name] = (
+                    p_checkout_dir.resolve()
+                )
                 repo = p_cloner.get_last_repo_cloned()
                 assert repo
                 remote_git_info = GitInfo(
@@ -930,9 +930,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                 logger.warning(
                     f"view {maybe_looker_view.id.view_name} from model {model_name}, connection {model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables"
                 )
-                if (
-                    not self.source_config.emit_reachable_views_only
-                ):
+                if not self.source_config.emit_reachable_views_only:
                     logger.warning(
                         "Consider enabling the `emit_reachable_views_only` flag to handle this case."
                     )
@@ -484,11 +484,11 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
         )

     def __get_upstream_dataset_urn(self) -> List[str]:
-        current_view_id: Optional[
-            LookerViewId
-        ] = self.looker_view_id_cache.get_looker_view_id(
-            view_name=self.view_context.name(),
-            base_folder_path=self.view_context.base_folder_path,
+        current_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
         )

         # Current view will always be present in cache. assert will silence the lint
@@ -172,10 +172,10 @@ class MLflowSource(Source):
         """
         Get all Registered Models in MLflow Model Registry.
         """
-        registered_models: Iterable[
-            RegisteredModel
-        ] = self._traverse_mlflow_search_func(
-            search_func=self.client.search_registered_models,
+        registered_models: Iterable[RegisteredModel] = (
+            self._traverse_mlflow_search_func(
+                search_func=self.client.search_registered_models,
+            )
         )
         return registered_models

@@ -288,7 +288,9 @@ class MongoDBSource(StatefulIngestionSourceBase):

         # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes
         self.mongo_client = MongoClient(
-            self.config.connect_uri,
+            self.config.connect_uri,
+            datetime_conversion="DATETIME_AUTO",
+            **options,  # type: ignore
         )

         # This cheaply tests the connection. For details, see
@@ -470,9 +472,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
             )
             # Add this information to the custom properties so user can know they are looking at downsampled schema
             dataset_properties.customProperties["schema.downsampled"] = "True"
-            dataset_properties.customProperties[
-                "schema.totalFields"
-            ] = f"{collection_schema_size}"
+            dataset_properties.customProperties["schema.totalFields"] = (
+                f"{collection_schema_size}"
+            )

         logger.debug(f"Size of collection fields = {len(collection_fields)}")
         # append each schema field (sort so output is consistent)
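The first MongoDB hunk above passes `datetime_conversion="DATETIME_AUTO"` so that BSON datetimes outside `datetime.datetime`'s range can still be decoded. A minimal sketch of the same client construction, assuming pymongo 4.3 or newer; the URI and the `options` dict are placeholders, not values from this package:

```python
from pymongo import MongoClient

# Placeholder connection options; the real source builds these from its config.
options = {"serverSelectionTimeoutMS": 5000}

# DATETIME_AUTO decodes out-of-range BSON datetimes as bson.datetime_ms.DatetimeMS
# instead of failing; in-range values still come back as datetime.datetime.
mongo_client = MongoClient(
    "mongodb://localhost:27017",  # placeholder URI
    datetime_conversion="DATETIME_AUTO",
    **options,
)
```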
@@ -286,7 +286,7 @@ class Neo4jSource(Source):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
-        for index, row in df.iterrows():
+        for _, row in df.iterrows():
             try:
                 yield MetadataWorkUnit(
                     id=row["key"],
datahub/ingestion/source/nifi.py CHANGED
@@ -184,9 +184,9 @@ class NifiSourceConfig(EnvConfigMixin):

     @validator("site_url")
     def validator_site_url(cls, site_url: str) -> str:
-        assert site_url.startswith(
-            ("http://", "https://")
-        ), "site_url must start with http:// or https://"
+        assert site_url.startswith(("http://", "https://")), (
+            "site_url must start with http:// or https://"
+        )

         if not site_url.endswith("/"):
             site_url = site_url + "/"
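The validator change above is again only a restyle, but the check it formats is a useful pattern: `str.startswith` accepts a tuple of prefixes, so a single call covers both schemes. A minimal sketch on a stripped-down, hypothetical config model, assuming the pydantic v1-style `validator` API used by this source:

```python
from pydantic import BaseModel, validator


class ExampleNifiConfig(BaseModel):
    # Hypothetical reduced model; only the field needed for the check is shown.
    site_url: str

    @validator("site_url")
    def validator_site_url(cls, site_url: str) -> str:
        # str.startswith accepts a tuple, so one call checks both schemes.
        assert site_url.startswith(("http://", "https://")), (
            "site_url must start with http:// or https://"
        )
        if not site_url.endswith("/"):
            site_url = site_url + "/"
        return site_url


print(ExampleNifiConfig(site_url="https://nifi.example.com").site_url)
# https://nifi.example.com/
```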
@@ -487,9 +487,7 @@ class NifiSource(Source):
     def get_report(self) -> SourceReport:
         return self.report

-    def update_flow(
-        self, pg_flow_dto: Dict, recursion_level: int = 0
-    ) -> None:  # noqa: C901
+    def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None:  # noqa: C901
         """
         Update self.nifi_flow with contents of the input process group `pg_flow_dto`
         """
@@ -548,16 +546,16 @@ class NifiSource(Source):
         for inputPort in flow_dto.get("inputPorts", []):
             component = inputPort.get("component")
             if inputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.INPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.INPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
@@ -576,16 +574,16 @@ class NifiSource(Source):
         for outputPort in flow_dto.get("outputPorts", []):
             component = outputPort.get("component")
             if outputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.OUTPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.OUTPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
@@ -101,16 +101,16 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert (
-                "url_complement" in self.get_token.keys()
-            ), "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            assert "url_complement" in self.get_token.keys(), (
+                "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            )
             if self.get_token["request_type"] == "get":
-                assert (
-                    "{username}" in self.get_token["url_complement"]
-                ), "we expect the keyword {username} to be present in the url"
-                assert (
-                    "{password}" in self.get_token["url_complement"]
-                ), "we expect the keyword {password} to be present in the url"
+                assert "{username}" in self.get_token["url_complement"], (
+                    "we expect the keyword {username} to be present in the url"
+                )
+                assert "{password}" in self.get_token["url_complement"], (
+                    "we expect the keyword {password} to be present in the url"
+                )
                 url4req = self.get_token["url_complement"].replace(
                     "{username}", self.username
                 )
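The asserts above guard a simple templating step: the configured `url_complement` must contain `{username}` and `{password}` placeholders, which are substituted before the token request is issued. A self-contained sketch of that logic; the `get_token` values below are made up for illustration, not a real endpoint:

```python
# Hypothetical config values; the real ones come from the OpenApiConfig recipe.
get_token = {
    "request_type": "get",
    "url_complement": "auth/token?user={username}&pwd={password}",
}
username, password = "alice", "s3cret"

assert "{username}" in get_token["url_complement"], (
    "we expect the keyword {username} to be present in the url"
)
assert "{password}" in get_token["url_complement"], (
    "we expect the keyword {password} to be present in the url"
)

url4req = get_token["url_complement"].replace("{username}", username)
url4req = url4req.replace("{password}", password)
print(url4req)  # auth/token?user=alice&pwd=s3cret
```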
@@ -225,9 +225,9 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 def default_for_dataset_type_mapping() -> Dict[str, str]:
     dict_: dict = {}
     for item in SupportedDataPlatform:
-        dict_[
-            item.value.powerbi_data_platform_name
-        ] = item.value.datahub_data_platform_name
+        dict_[item.value.powerbi_data_platform_name] = (
+            item.value.datahub_data_platform_name
+        )

     return dict_

@@ -303,15 +303,15 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping: Union[
-        Dict[str, str], Dict[str, PlatformDetail]
-    ] = pydantic.Field(
-        default_factory=default_for_dataset_type_mapping,
-        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-        "DataHub supported datasources."
-        "You can configured platform instance for dataset lineage. "
-        "See Quickstart Recipe for mapping",
-        hidden_from_docs=True,
+    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
+        pydantic.Field(
+            default_factory=default_for_dataset_type_mapping,
+            description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+            "DataHub supported datasources."
+            "You can configured platform instance for dataset lineage. "
+            "See Quickstart Recipe for mapping",
+            hidden_from_docs=True,
+        )
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
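The `dataset_type_mapping` field above keeps its behavior: a `pydantic.Field` whose default is built lazily by `default_for_dataset_type_mapping`. A simplified sketch of the `default_factory` pattern; the two platform names are illustrative rather than the full `SupportedDataPlatform` enum, and the DataHub-specific `hidden_from_docs` extra is omitted:

```python
from typing import Dict

import pydantic


def default_for_dataset_type_mapping() -> Dict[str, str]:
    # The real source iterates SupportedDataPlatform; this is a hard-coded stand-in.
    return {"Snowflake": "snowflake", "PostgreSQL": "postgres"}


class ExampleConfig(pydantic.BaseModel):
    dataset_type_mapping: Dict[str, str] = pydantic.Field(
        default_factory=default_for_dataset_type_mapping,
        description="[deprecated] Use server_to_platform_instance instead.",
    )


print(ExampleConfig().dataset_type_mapping)
# {'Snowflake': 'snowflake', 'PostgreSQL': 'postgres'}
```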
@@ -128,17 +128,17 @@ def get_upstream_tables(
         reporter.m_query_parse_successes += 1

         try:
-            lineage: List[
-                datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
-            ] = resolver.MQueryResolver(
-                table=table,
-                parse_tree=parse_tree,
-                reporter=reporter,
-                parameters=parameters,
-            ).resolve_to_lineage(
-                ctx=ctx,
-                config=config,
-                platform_instance_resolver=platform_instance_resolver,
+            lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = (
+                resolver.MQueryResolver(
+                    table=table,
+                    parse_tree=parse_tree,
+                    reporter=reporter,
+                    parameters=parameters,
+                ).resolve_to_lineage(
+                    ctx=ctx,
+                    config=config,
+                    platform_instance_resolver=platform_instance_resolver,
+                )
             )

             if lineage:
@@ -170,8 +170,7 @@ class AbstractLineage(ABC):
         logger.debug(f"Processing arguments {arguments}")

         if (
-            len(arguments)
-            >= 4  # [0] is warehouse FQDN.
+            len(arguments) >= 4  # [0] is warehouse FQDN.
             # [1] is endpoint, we are not using it.
             # [2] is "Catalog" key
             # [3] is catalog's value
@@ -215,16 +214,16 @@ class AbstractLineage(ABC):
             native_sql_parser.remove_special_characters(query)
         )

-        parsed_result: Optional[
-            "SqlParsingResult"
-        ] = native_sql_parser.parse_custom_sql(
-            ctx=self.ctx,
-            query=query,
-            platform=self.get_platform_pair().datahub_data_platform_name,
-            platform_instance=platform_detail.platform_instance,
-            env=platform_detail.env,
-            database=database,
-            schema=schema,
+        parsed_result: Optional["SqlParsingResult"] = (
+            native_sql_parser.parse_custom_sql(
+                ctx=self.ctx,
+                query=query,
+                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform_instance=platform_detail.platform_instance,
+                env=platform_detail.env,
+                database=database,
+                schema=schema,
+            )
         )

         if parsed_result is None:
@@ -410,9 +409,9 @@ class DatabricksLineage(AbstractLineage):
             f"Processing Databrick data-access function detail {data_access_func_detail}"
         )
         table_detail: Dict[str, str] = {}
-        temp_accessor: Optional[
-            IdentifierAccessor
-        ] = data_access_func_detail.identifier_accessor
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )

         while temp_accessor:
             # Condition to handle databricks M-query pattern where table, schema and database all are present in
@@ -647,11 +646,13 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
         # Second is schema name
         schema_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next,  # type: ignore
         ).items["Name"]
         # Third is table name
         table_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next.next,  # type: ignore
         ).items["Name"]

         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
@@ -768,10 +769,13 @@ class NativeQueryLineage(AbstractLineage):
         ):  # database name is explicitly set
             return database

-        return get_next_item(  # database name is set in Name argument
-            data_access_tokens, "Name"
-        ) or get_next_item(  # If both above arguments are not available, then try Catalog
-            data_access_tokens, "Catalog"
+        return (
+            get_next_item(  # database name is set in Name argument
+                data_access_tokens, "Name"
+            )
+            or get_next_item(  # If both above arguments are not available, then try Catalog
+                data_access_tokens, "Catalog"
+            )
         )

     def create_lineage(
@@ -819,9 +823,7 @@ class NativeQueryLineage(AbstractLineage):
             values=tree_function.remove_whitespaces_from_list(
                 tree_function.token_values(flat_argument_list[1])
             ),
-        )[
-            0
-        ]  # Remove any whitespaces and double quotes character
+        )[0]  # Remove any whitespaces and double quotes character

         server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
@@ -188,9 +188,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         # - The inner function Table.TransformColumnTypes takes #"Removed Columns1"
         #   (a table reference) as its first argument
         # - Its result is then passed as the first argument to Table.SplitColumn
-        second_invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(first_argument)
+        second_invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(first_argument)
+        )
         if second_invoke_expression:
             # 1. The First argument is function call
             # 2. That function's first argument references next table variable
@@ -304,14 +304,14 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             logger.debug(v_statement.pretty())
             return None

-        invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(rh_tree)
+        invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(rh_tree)
+        )

         if invoke_expression is not None:
-            result: Union[
-                DataAccessFunctionDetail, List[str], None
-            ] = self._process_invoke_expression(invoke_expression)
+            result: Union[DataAccessFunctionDetail, List[str], None] = (
+                self._process_invoke_expression(invoke_expression)
+            )
             if result is None:
                 return None  # No need to process some un-expected grammar found while processing invoke_expression
             if isinstance(result, DataAccessFunctionDetail):
@@ -368,9 +368,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             return lineage

         # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail
-        table_links: List[
-            DataAccessFunctionDetail
-        ] = self.create_data_access_functional_detail(output_variable)
+        table_links: List[DataAccessFunctionDetail] = (
+            self.create_data_access_functional_detail(output_variable)
+        )

         # Each item is data-access function
         for f_detail in table_links:
@@ -390,7 +390,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):

             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
-            pattern_handler:
+            pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
                 config=config,
@@ -945,9 +945,9 @@ class Mapper:
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
         # Lets convert dashboard to datahub dashboard
-        dashboard_mcps: List[
-            MetadataChangeProposalWrapper
-        ] = self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        dashboard_mcps: List[MetadataChangeProposalWrapper] = (
+            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        )

         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
@@ -1472,9 +1472,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def _get_dashboard_patch_work_unit(
         self, work_unit: MetadataWorkUnit
     ) -> Optional[MetadataWorkUnit]:
-        dashboard_info_aspect: Optional[
-            DashboardInfoClass
-        ] = work_unit.get_aspect_of_type(DashboardInfoClass)
+        dashboard_info_aspect: Optional[DashboardInfoClass] = (
+            work_unit.get_aspect_of_type(DashboardInfoClass)
+        )

         if dashboard_info_aspect and self.source_config.patch_metadata:
             return convert_dashboard_info_to_patch(
@@ -425,9 +425,9 @@ class DataResolverBase(ABC):

             response.raise_for_status()

-            assert (
-                Constant.VALUE in response.json()
-            ), "'value' key is not present in paginated response"
+            assert Constant.VALUE in response.json(), (
+                "'value' key is not present in paginated response"
+            )

             if not response.json()[Constant.VALUE]:  # if it is an empty list then break
                 break
@@ -447,13 +447,13 @@ class DataResolverBase(ABC):
         if raw_app is None:
             return None

-        assert (
-            Constant.ID in raw_app
-        ), f"{Constant.ID} is required field not present in server response"
+        assert Constant.ID in raw_app, (
+            f"{Constant.ID} is required field not present in server response"
+        )

-        assert (
-            Constant.NAME in raw_app
-        ), f"{Constant.NAME} is required field not present in server response"
+        assert Constant.NAME in raw_app, (
+            f"{Constant.NAME} is required field not present in server response"
+        )

         return App(
             id=raw_app[Constant.ID],
@@ -156,7 +156,7 @@ class QlikAPI:
                 )
                 if chart:
                     if not chart.title:
-                        chart.title = f"Object {i+1} of Sheet '{sheet.title}'"
+                        chart.title = f"Object {i + 1} of Sheet '{sheet.title}'"
                     sheet.charts.append(chart)
             websocket_connection.handle.pop()
         return sheet
@@ -178,9 +178,9 @@ class RedshiftConfig(
     @root_validator(pre=True)
     def check_email_is_set_on_usage(cls, values):
         if values.get("include_usage_statistics"):
-            assert (
-                "email_domain" in values and values["email_domain"]
-            ), "email_domain needs to be set if usage is enabled"
+            assert "email_domain" in values and values["email_domain"], (
+                "email_domain needs to be set if usage is enabled"
+            )
         return values

     @root_validator(skip_on_failure=True)
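The Redshift hunk is the same assert restyle inside a cross-field check: when usage statistics are enabled, `email_domain` must be set. A minimal sketch of that validation on a stripped-down, hypothetical model, assuming the pydantic v1-style `root_validator` API used in this file:

```python
from typing import Optional

from pydantic import BaseModel, root_validator


class ExampleRedshiftConfig(BaseModel):
    # Hypothetical reduced model; the real RedshiftConfig has many more fields.
    include_usage_statistics: bool = False
    email_domain: Optional[str] = None

    @root_validator(pre=True)
    def check_email_is_set_on_usage(cls, values):
        if values.get("include_usage_statistics"):
            assert "email_domain" in values and values["email_domain"], (
                "email_domain needs to be set if usage is enabled"
            )
        return values


ExampleRedshiftConfig(include_usage_statistics=False)  # ok: usage disabled
# ExampleRedshiftConfig(include_usage_statistics=True)  # would raise a ValidationError
```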