acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/view_upstream.py
CHANGED

@@ -484,11 +484,11 @@ class NativeDerivedViewUpstream(AbstractViewUpstream):
         )
 
     def __get_upstream_dataset_urn(self) -> List[str]:
-        current_view_id: Optional[
-            LookerViewId
-        ] = self.looker_view_id_cache.get_looker_view_id(
-            view_name=self.view_context.name(),
-            base_folder_path=self.view_context.base_folder_path,
+        current_view_id: Optional[LookerViewId] = (
+            self.looker_view_id_cache.get_looker_view_id(
+                view_name=self.view_context.name(),
+                base_folder_path=self.view_context.base_folder_path,
+            )
         )
 
         # Current view will always be present in cache. assert will silence the lint
datahub/ingestion/source/metabase.py
CHANGED

@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor,
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):
 
         return platform, dbname, schema, platform_instance
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/mlflow.py
CHANGED

@@ -172,10 +172,10 @@ class MLflowSource(Source):
         """
         Get all Registered Models in MLflow Model Registry.
         """
-        registered_models: Iterable[
-            RegisteredModel
-        ] = self._traverse_mlflow_search_func(
-            search_func=self.client.search_registered_models,
+        registered_models: Iterable[RegisteredModel] = (
+            self._traverse_mlflow_search_func(
+                search_func=self.client.search_registered_models,
+            )
         )
         return registered_models
 
@@ -333,8 +333,3 @@ class MLflowSource(Source):
             aspect=global_tags,
         )
         return wu
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MLflowConfig.parse_obj(config_dict)
-        return cls(ctx, config)
datahub/ingestion/source/mode.py
CHANGED

@@ -893,11 +893,11 @@ class ModeSource(StatefulIngestionSourceBase):
             jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
-            r"{% form %}(.*){% endform %}",
-            "",
-            query,
-            0,
-            re.MULTILINE | re.DOTALL,
+            pattern=r"{% form %}(.*){% endform %}",
+            repl="",
+            string=query,
+            count=0,
+            flags=re.MULTILINE | re.DOTALL,
         )
 
         # Wherever we don't resolve the jinja params, we replace it with NULL
datahub/ingestion/source/mongodb.py
CHANGED

@@ -288,7 +288,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
 
         # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes
         self.mongo_client = MongoClient(
-            self.config.connect_uri,
+            self.config.connect_uri,
+            datetime_conversion="DATETIME_AUTO",
+            **options,  # type: ignore
         )
 
         # This cheaply tests the connection. For details, see
@@ -470,9 +472,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
                 )
                 # Add this information to the custom properties so user can know they are looking at downsampled schema
                 dataset_properties.customProperties["schema.downsampled"] = "True"
-                dataset_properties.customProperties[
-                    "schema.totalFields"
-                ] = f"{collection_schema_size}"
+                dataset_properties.customProperties["schema.totalFields"] = (
+                    f"{collection_schema_size}"
+                )
 
             logger.debug(f"Size of collection fields = {len(collection_fields)}")
             # append each schema field (sort so output is consistent)
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED

@@ -286,7 +286,7 @@ class Neo4jSource(Source):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
-        for
+        for _, row in df.iterrows():
             try:
                 yield MetadataWorkUnit(
                     id=row["key"],
datahub/ingestion/source/nifi.py
CHANGED

@@ -184,9 +184,9 @@ class NifiSourceConfig(EnvConfigMixin):
 
     @validator("site_url")
     def validator_site_url(cls, site_url: str) -> str:
-        assert site_url.startswith(
-            ("http://", "https://")
-        ), "site_url must start with http:// or https://"
+        assert site_url.startswith(("http://", "https://")), (
+            "site_url must start with http:// or https://"
+        )
 
         if not site_url.endswith("/"):
             site_url = site_url + "/"
@@ -484,17 +484,10 @@ class NifiSource(Source):
     def rest_api_base_url(self):
         return self.config.site_url[: -len("nifi/")] + "nifi-api/"
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = NifiSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_report(self) -> SourceReport:
         return self.report
 
-    def update_flow(
-        self, pg_flow_dto: Dict, recursion_level: int = 0
-    ) -> None:  # noqa: C901
+    def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None:  # noqa: C901
         """
         Update self.nifi_flow with contents of the input process group `pg_flow_dto`
         """
@@ -553,16 +546,16 @@ class NifiSource(Source):
         for inputPort in flow_dto.get("inputPorts", []):
             component = inputPort.get("component")
             if inputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.INPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.INPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
@@ -581,16 +574,16 @@ class NifiSource(Source):
         for outputPort in flow_dto.get("outputPorts", []):
             component = outputPort.get("component")
             if outputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.OUTPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.OUTPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
datahub/ingestion/source/openapi.py
CHANGED

@@ -101,16 +101,16 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert (
-                "url_complement" in self.get_token.keys()
-            ), "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            assert "url_complement" in self.get_token.keys(), (
+                "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            )
             if self.get_token["request_type"] == "get":
-                assert (
-                    "{username}" in self.get_token["url_complement"]
-                ), "we expect the keyword {username} to be present in the url"
-                assert (
-                    "{password}" in self.get_token["url_complement"]
-                ), "we expect the keyword {password} to be present in the url"
+                assert "{username}" in self.get_token["url_complement"], (
+                    "we expect the keyword {username} to be present in the url"
+                )
+                assert "{password}" in self.get_token["url_complement"], (
+                    "we expect the keyword {password} to be present in the url"
+                )
                 url4req = self.get_token["url_complement"].replace(
                     "{username}", self.username
                 )
datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -225,9 +225,9 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 def default_for_dataset_type_mapping() -> Dict[str, str]:
     dict_: dict = {}
     for item in SupportedDataPlatform:
-        dict_[
-            item.value.powerbi_data_platform_name
-        ] = item.value.datahub_data_platform_name
+        dict_[item.value.powerbi_data_platform_name] = (
+            item.value.datahub_data_platform_name
+        )
 
     return dict_
 
@@ -303,15 +303,15 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping: Union[
-        Dict[str, str], Dict[str, PlatformDetail]
-    ] = pydantic.Field(
-        default_factory=default_for_dataset_type_mapping,
-        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-        "DataHub supported datasources."
-        "You can configured platform instance for dataset lineage. "
-        "See Quickstart Recipe for mapping",
-        hidden_from_docs=True,
+    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
+        pydantic.Field(
+            default_factory=default_for_dataset_type_mapping,
+            description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+            "DataHub supported datasources."
+            "You can configured platform instance for dataset lineage. "
+            "See Quickstart Recipe for mapping",
+            hidden_from_docs=True,
+        )
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED

@@ -128,17 +128,17 @@ def get_upstream_tables(
         reporter.m_query_parse_successes += 1
 
     try:
-        lineage: List[
-            datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
-        ] = resolver.MQueryResolver(
-            table=table,
-            parse_tree=parse_tree,
-            reporter=reporter,
-            parameters=parameters,
-        ).resolve_to_lineage(
-            ctx=ctx,
-            config=config,
-            platform_instance_resolver=platform_instance_resolver,
+        lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = (
+            resolver.MQueryResolver(
+                table=table,
+                parse_tree=parse_tree,
+                reporter=reporter,
+                parameters=parameters,
+            ).resolve_to_lineage(
+                ctx=ctx,
+                config=config,
+                platform_instance_resolver=platform_instance_resolver,
+            )
         )
 
         if lineage:
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
CHANGED

@@ -170,8 +170,7 @@ class AbstractLineage(ABC):
         logger.debug(f"Processing arguments {arguments}")
 
         if (
-            len(arguments)
-            >= 4  # [0] is warehouse FQDN.
+            len(arguments) >= 4  # [0] is warehouse FQDN.
             # [1] is endpoint, we are not using it.
             # [2] is "Catalog" key
             # [3] is catalog's value
@@ -215,16 +214,16 @@ class AbstractLineage(ABC):
             native_sql_parser.remove_special_characters(query)
         )
 
-        parsed_result: Optional[
-            "SqlParsingResult"
-        ] = native_sql_parser.parse_custom_sql(
-            ctx=self.ctx,
-            query=query,
-            platform=self.get_platform_pair().datahub_data_platform_name,
-            platform_instance=platform_detail.platform_instance,
-            env=platform_detail.env,
-            database=database,
-            schema=schema,
+        parsed_result: Optional["SqlParsingResult"] = (
+            native_sql_parser.parse_custom_sql(
+                ctx=self.ctx,
+                query=query,
+                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform_instance=platform_detail.platform_instance,
+                env=platform_detail.env,
+                database=database,
+                schema=schema,
+            )
         )
 
         if parsed_result is None:
@@ -410,9 +409,9 @@ class DatabricksLineage(AbstractLineage):
             f"Processing Databrick data-access function detail {data_access_func_detail}"
         )
         table_detail: Dict[str, str] = {}
-        temp_accessor: Optional[
-            IdentifierAccessor
-        ] = data_access_func_detail.identifier_accessor
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
 
         while temp_accessor:
             # Condition to handle databricks M-query pattern where table, schema and database all are present in
@@ -647,11 +646,13 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
         # Second is schema name
         schema_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next,  # type: ignore
         ).items["Name"]
         # Third is table name
         table_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next.next,  # type: ignore
         ).items["Name"]
 
         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
@@ -768,10 +769,13 @@ class NativeQueryLineage(AbstractLineage):
         ):  # database name is explicitly set
             return database
 
-        return get_next_item(  # database name is set in Name argument
-            data_access_tokens, "Name"
-        ) or get_next_item(  # If both above arguments are not available, then try Catalog
-            data_access_tokens, "Catalog"
+        return (
+            get_next_item(  # database name is set in Name argument
+                data_access_tokens, "Name"
+            )
+            or get_next_item(  # If both above arguments are not available, then try Catalog
+                data_access_tokens, "Catalog"
+            )
         )
 
     def create_lineage(
@@ -819,9 +823,7 @@ class NativeQueryLineage(AbstractLineage):
             values=tree_function.remove_whitespaces_from_list(
                 tree_function.token_values(flat_argument_list[1])
             ),
-        )[
-            0
-        ]  # Remove any whitespaces and double quotes character
+        )[0]  # Remove any whitespaces and double quotes character
 
         server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
 
datahub/ingestion/source/powerbi/m_query/resolver.py
CHANGED

@@ -188,9 +188,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         # - The inner function Table.TransformColumnTypes takes #"Removed Columns1"
         #   (a table reference) as its first argument
         # - Its result is then passed as the first argument to Table.SplitColumn
-        second_invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(first_argument)
+        second_invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(first_argument)
+        )
         if second_invoke_expression:
             # 1. The First argument is function call
             # 2. That function's first argument references next table variable
@@ -304,14 +304,14 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             logger.debug(v_statement.pretty())
             return None
 
-        invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(rh_tree)
+        invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(rh_tree)
+        )
 
         if invoke_expression is not None:
-            result: Union[
-                DataAccessFunctionDetail, List[str], None
-            ] = self._process_invoke_expression(invoke_expression)
+            result: Union[DataAccessFunctionDetail, List[str], None] = (
+                self._process_invoke_expression(invoke_expression)
+            )
             if result is None:
                 return None  # No need to process some un-expected grammar found while processing invoke_expression
             if isinstance(result, DataAccessFunctionDetail):
@@ -368,9 +368,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             return lineage
 
         # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail
-        table_links: List[
-            DataAccessFunctionDetail
-        ] = self.create_data_access_functional_detail(output_variable)
+        table_links: List[DataAccessFunctionDetail] = (
+            self.create_data_access_functional_detail(output_variable)
+        )
 
         # Each item is data-access function
         for f_detail in table_links:
@@ -390,7 +390,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
-            pattern_handler:
+            pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
                 config=config,
datahub/ingestion/source/powerbi/powerbi.py
CHANGED

@@ -945,9 +945,9 @@ class Mapper:
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
         # Lets convert dashboard to datahub dashboard
-        dashboard_mcps: List[
-            MetadataChangeProposalWrapper
-        ] = self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        dashboard_mcps: List[MetadataChangeProposalWrapper] = (
+            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        )
 
         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
@@ -1472,9 +1472,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def _get_dashboard_patch_work_unit(
         self, work_unit: MetadataWorkUnit
     ) -> Optional[MetadataWorkUnit]:
-        dashboard_info_aspect: Optional[
-            DashboardInfoClass
-        ] = work_unit.get_aspect_of_type(DashboardInfoClass)
+        dashboard_info_aspect: Optional[DashboardInfoClass] = (
+            work_unit.get_aspect_of_type(DashboardInfoClass)
+        )
 
         if dashboard_info_aspect and self.source_config.patch_metadata:
             return convert_dashboard_info_to_patch(
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
CHANGED

@@ -425,9 +425,9 @@ class DataResolverBase(ABC):
 
             response.raise_for_status()
 
-            assert (
-                Constant.VALUE in response.json()
-            ), "'value' key is not present in paginated response"
+            assert Constant.VALUE in response.json(), (
+                "'value' key is not present in paginated response"
+            )
 
             if not response.json()[Constant.VALUE]:  # if it is an empty list then break
                 break
@@ -447,13 +447,13 @@ class DataResolverBase(ABC):
         if raw_app is None:
             return None
 
-        assert (
-            Constant.ID in raw_app
-        ), f"{Constant.ID} is required field not present in server response"
+        assert Constant.ID in raw_app, (
+            f"{Constant.ID} is required field not present in server response"
+        )
 
-        assert (
-            Constant.NAME in raw_app
-        ), f"{Constant.NAME} is required field not present in server response"
+        assert Constant.NAME in raw_app, (
+            f"{Constant.NAME} is required field not present in server response"
+        )
 
         return App(
             id=raw_app[Constant.ID],
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py
CHANGED

@@ -96,7 +96,7 @@ class PowerBiAPI:
             url: str = e.request.url if e.request else "URL not available"
             self.reporter.warning(
                 title="Metadata API Timeout",
-                message=
+                message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
                 context=f"url={url}",
             )
 
@@ -173,7 +173,7 @@ class PowerBiAPI:
                 entity=entity_name,
                 entity_id=entity_id,
             )
-        except:
+        except Exception:
             e = self.log_http_error(
                 message=f"Unable to fetch users for {entity_name}({entity_id})."
             )
@@ -210,7 +210,7 @@ class PowerBiAPI:
                     message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                     context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
                 )
-        except:
+        except Exception:
             self.log_http_error(
                 message=f"Unable to fetch reports for workspace {workspace.name}"
             )
@@ -260,7 +260,7 @@ class PowerBiAPI:
 
             groups = self._get_resolver().get_groups(filter_=filter_)
 
-        except:
+        except Exception:
             self.log_http_error(message="Unable to fetch list of workspaces")
             # raise  # we want this exception to bubble up
 
@@ -292,7 +292,7 @@ class PowerBiAPI:
             modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
                 self.__config.modified_since
             )
-        except:
+        except Exception:
             self.log_http_error(message="Unable to fetch list of modified workspaces.")
 
         return modified_workspace_ids
@@ -303,8 +303,8 @@ class PowerBiAPI:
             scan_id = self.__admin_api_resolver.create_scan_job(
                 workspace_ids=workspace_ids
             )
-        except:
-            e = self.log_http_error(message=
+        except Exception:
+            e = self.log_http_error(message="Unable to fetch get scan result.")
             if data_resolver.is_permission_error(cast(Exception, e)):
                 logger.warning(
                     "Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED

@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/qlik_sense/qlik_api.py
CHANGED

@@ -156,7 +156,7 @@ class QlikAPI:
                 )
                 if chart:
                     if not chart.title:
-                        chart.title = f"Object {i+1} of Sheet '{sheet.title}'"
+                        chart.title = f"Object {i + 1} of Sheet '{sheet.title}'"
                     sheet.charts.append(chart)
             websocket_connection.handle.pop()
         return sheet
datahub/ingestion/source/redash.py
CHANGED

@@ -369,11 +369,6 @@ class RedashSource(Source):
         else:
             raise ValueError(f"Failed to connect to {self.config.connect_uri}/api")
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = RedashConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
datahub/ingestion/source/redshift/config.py
CHANGED

@@ -178,9 +178,9 @@ class RedshiftConfig(
     @root_validator(pre=True)
     def check_email_is_set_on_usage(cls, values):
         if values.get("include_usage_statistics"):
-            assert (
-                "email_domain" in values and values["email_domain"]
-            ), "email_domain needs to be set if usage is enabled"
+            assert "email_domain" in values and values["email_domain"], (
+                "email_domain needs to be set if usage is enabled"
+            )
         return values
 
     @root_validator(skip_on_failure=True)