acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2335 -2337
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +157 -157
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mongodb.py
CHANGED
@@ -288,7 +288,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
 
         # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes
         self.mongo_client = MongoClient(
-            self.config.connect_uri,
+            self.config.connect_uri,
+            datetime_conversion="DATETIME_AUTO",
+            **options,  # type: ignore
         )
 
         # This cheaply tests the connection. For details, see
@@ -470,9 +472,9 @@ class MongoDBSource(StatefulIngestionSourceBase):
         )
         # Add this information to the custom properties so user can know they are looking at downsampled schema
         dataset_properties.customProperties["schema.downsampled"] = "True"
-        dataset_properties.customProperties[
-            "schema.totalFields"
-        ] = f"{collection_schema_size}"
+        dataset_properties.customProperties["schema.totalFields"] = (
+            f"{collection_schema_size}"
+        )
 
         logger.debug(f"Size of collection fields = {len(collection_fields)}")
         # append each schema field (sort so output is consistent)
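Note on the change above: per the pymongo documentation linked in the hunk, datetime_conversion="DATETIME_AUTO" makes the client return bson.DatetimeMS instead of raising when a stored BSON date falls outside Python's datetime range. A minimal sketch, assuming pymongo >= 4.3; the URI, database, and collection names are placeholders, not values from this package:

from pymongo import MongoClient

client = MongoClient(
    "mongodb://localhost:27017",  # placeholder connection URI
    datetime_conversion="DATETIME_AUTO",  # out-of-range BSON dates come back as bson.DatetimeMS
)
doc = client["example_db"]["example_collection"].find_one()  # hypothetical names
print(doc)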
datahub/ingestion/source/neo4j/neo4j_source.py
CHANGED
@@ -286,7 +286,7 @@ class Neo4jSource(Source):
         df = self.get_neo4j_metadata(
             "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
         )
-        for
+        for _, row in df.iterrows():
             try:
                 yield MetadataWorkUnit(
                     id=row["key"],
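Note on the change above: the index from DataFrame.iterrows() is unused, so it is bound to "_"; the same convention is applied to the os.walk() and dict.items() loops further down in this diff. A minimal sketch with made-up data:

import pandas as pd

# iterrows() yields (index, row) pairs; the unused index is bound to "_".
df = pd.DataFrame({"key": ["Person", "Movie"], "value": [{"labels": []}, {"labels": []}]})
for _, row in df.iterrows():
    print(row["key"], row["value"])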
datahub/ingestion/source/nifi.py
CHANGED
@@ -184,9 +184,9 @@ class NifiSourceConfig(EnvConfigMixin):
 
     @validator("site_url")
     def validator_site_url(cls, site_url: str) -> str:
-        assert site_url.startswith(
-            ("http://", "https://")
-        ), "site_url must start with http:// or https://"
+        assert site_url.startswith(("http://", "https://")), (
+            "site_url must start with http:// or https://"
+        )
 
         if not site_url.endswith("/"):
             site_url = site_url + "/"
@@ -487,9 +487,7 @@ class NifiSource(Source):
     def get_report(self) -> SourceReport:
         return self.report
 
-    def update_flow(
-        self, pg_flow_dto: Dict, recursion_level: int = 0
-    ) -> None:  # noqa: C901
+    def update_flow(self, pg_flow_dto: Dict, recursion_level: int = 0) -> None:  # noqa: C901
         """
         Update self.nifi_flow with contents of the input process group `pg_flow_dto`
         """
@@ -548,16 +546,16 @@ class NifiSource(Source):
         for inputPort in flow_dto.get("inputPorts", []):
             component = inputPort.get("component")
             if inputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.INPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.INPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
@@ -576,16 +574,16 @@ class NifiSource(Source):
         for outputPort in flow_dto.get("outputPorts", []):
             component = outputPort.get("component")
             if outputPort.get("allowRemoteAccess"):
-                self.nifi_flow.remotely_accessible_ports[
-                    component.get("id")
-                ] = NifiComponent(
-                    component.get("id"),
-                    component.get("name"),
-                    component.get("type"),
-                    component.get("parentGroupId"),
-                    NifiType.OUTPUT_PORT,
-                    comments=component.get("comments"),
-                    status=component.get("status", {}).get("runStatus"),
+                self.nifi_flow.remotely_accessible_ports[component.get("id")] = (
+                    NifiComponent(
+                        component.get("id"),
+                        component.get("name"),
+                        component.get("type"),
+                        component.get("parentGroupId"),
+                        NifiType.OUTPUT_PORT,
+                        comments=component.get("comments"),
+                        status=component.get("status", {}).get("runStatus"),
+                    )
                 )
                 logger.debug(f"Adding remotely accessible port {component.get('id')}")
             else:
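Note: the validator change above is one instance of a formatting pattern that repeats throughout this release (OpenAPI, PowerBI, Redshift, Salesforce below): the assertion condition stays on the assert line and the message is wrapped in parentheses. A minimal sketch with a placeholder value:

# New layout: condition on one line, long message parenthesized underneath.
site_url = "https://nifi.example.com/nifi/"  # placeholder value
assert site_url.startswith(("http://", "https://")), (
    "site_url must start with http:// or https://"
)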
datahub/ingestion/source/openapi.py
CHANGED
@@ -101,16 +101,16 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert (
-                "url_complement" in self.get_token.keys()
-            ), "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            assert "url_complement" in self.get_token.keys(), (
+                "When 'request_type' is set to 'get', an url_complement is needed for the request."
+            )
             if self.get_token["request_type"] == "get":
-                assert (
-                    "{username}" in self.get_token["url_complement"]
-                ), "we expect the keyword {username} to be present in the url"
-                assert (
-                    "{password}" in self.get_token["url_complement"]
-                ), "we expect the keyword {password} to be present in the url"
+                assert "{username}" in self.get_token["url_complement"], (
+                    "we expect the keyword {username} to be present in the url"
+                )
+                assert "{password}" in self.get_token["url_complement"], (
+                    "we expect the keyword {password} to be present in the url"
+                )
                 url4req = self.get_token["url_complement"].replace(
                     "{username}", self.username
                 )
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -225,9 +225,9 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
 def default_for_dataset_type_mapping() -> Dict[str, str]:
     dict_: dict = {}
     for item in SupportedDataPlatform:
-        dict_[
-            item.value.powerbi_data_platform_name
-        ] = item.value.datahub_data_platform_name
+        dict_[item.value.powerbi_data_platform_name] = (
+            item.value.datahub_data_platform_name
+        )
 
     return dict_
 
@@ -303,15 +303,15 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping: Union[
-        Dict[str, str], Dict[str, PlatformDetail]
-    ] = pydantic.Field(
-        default_factory=default_for_dataset_type_mapping,
-        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-        "DataHub supported datasources."
-        "You can configured platform instance for dataset lineage. "
-        "See Quickstart Recipe for mapping",
-        hidden_from_docs=True,
+    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
+        pydantic.Field(
+            default_factory=default_for_dataset_type_mapping,
+            description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+            "DataHub supported datasources."
+            "You can configured platform instance for dataset lineage. "
+            "See Quickstart Recipe for mapping",
+            hidden_from_docs=True,
+        )
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
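Note: the dataset_type_mapping change above only re-wraps an existing pydantic.Field(default_factory=...) declaration. A standalone sketch of that pattern, assuming pydantic-style models as used in this codebase; the mapping values are an illustrative subset and hidden_from_docs (a DataHub-specific Field extra) is omitted:

from typing import Dict
import pydantic

def default_mapping() -> Dict[str, str]:
    # Illustrative subset; the real default is built from SupportedDataPlatform.
    return {"Snowflake": "snowflake", "PostgreSQL": "postgres"}

class ExampleConfig(pydantic.BaseModel):
    # default_factory builds a fresh dict per instance instead of sharing one mutable default.
    dataset_type_mapping: Dict[str, str] = pydantic.Field(
        default_factory=default_mapping,
        description="Mapping of PowerBI datasource types to DataHub platform names.",
    )

print(ExampleConfig().dataset_type_mapping)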
datahub/ingestion/source/powerbi/m_query/parser.py
CHANGED
@@ -128,17 +128,17 @@ def get_upstream_tables(
         reporter.m_query_parse_successes += 1
 
         try:
-            lineage: List[
-                datahub.ingestion.source.powerbi.m_query.data_classes.Lineage
-            ] = resolver.MQueryResolver(
-                table=table,
-                parse_tree=parse_tree,
-                reporter=reporter,
-                parameters=parameters,
-            ).resolve_to_lineage(
-                ctx=ctx,
-                config=config,
-                platform_instance_resolver=platform_instance_resolver,
+            lineage: List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage] = (
+                resolver.MQueryResolver(
+                    table=table,
+                    parse_tree=parse_tree,
+                    reporter=reporter,
+                    parameters=parameters,
+                ).resolve_to_lineage(
+                    ctx=ctx,
+                    config=config,
+                    platform_instance_resolver=platform_instance_resolver,
+                )
             )
 
             if lineage:
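Note: the lineage assignment above shows the other formatting pattern that recurs in this release: when an annotated assignment is too long, the right-hand side is wrapped in parentheses instead of splitting the subscripted type annotation. A self-contained sketch with made-up names:

from typing import List, Optional

def load_upstream_tables(query: str) -> List[str]:  # hypothetical helper, not from the diff
    return [f"table_for:{query}"]

# Old style split the annotation across lines; the new style keeps the annotation on
# one line and parenthesizes the value expression instead.
result: Optional[List[str]] = (
    load_upstream_tables("select * from sales_schema.orders_table_with_a_long_name")
)
print(result)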
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
CHANGED
@@ -170,8 +170,7 @@ class AbstractLineage(ABC):
         logger.debug(f"Processing arguments {arguments}")
 
         if (
-            len(arguments)
-            >= 4  # [0] is warehouse FQDN.
+            len(arguments) >= 4  # [0] is warehouse FQDN.
             # [1] is endpoint, we are not using it.
             # [2] is "Catalog" key
             # [3] is catalog's value
@@ -215,16 +214,16 @@ class AbstractLineage(ABC):
             native_sql_parser.remove_special_characters(query)
         )
 
-        parsed_result: Optional[
-            "SqlParsingResult"
-        ] = native_sql_parser.parse_custom_sql(
-            ctx=self.ctx,
-            query=query,
-            platform=self.get_platform_pair().datahub_data_platform_name,
-            platform_instance=platform_detail.platform_instance,
-            env=platform_detail.env,
-            database=database,
-            schema=schema,
+        parsed_result: Optional["SqlParsingResult"] = (
+            native_sql_parser.parse_custom_sql(
+                ctx=self.ctx,
+                query=query,
+                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform_instance=platform_detail.platform_instance,
+                env=platform_detail.env,
+                database=database,
+                schema=schema,
+            )
         )
 
         if parsed_result is None:
@@ -410,9 +409,9 @@ class DatabricksLineage(AbstractLineage):
             f"Processing Databrick data-access function detail {data_access_func_detail}"
         )
         table_detail: Dict[str, str] = {}
-        temp_accessor: Optional[
-            IdentifierAccessor
-        ] = data_access_func_detail.identifier_accessor
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
 
         while temp_accessor:
             # Condition to handle databricks M-query pattern where table, schema and database all are present in
@@ -647,11 +646,13 @@ class ThreeStepDataAccessPattern(AbstractLineage, ABC):
         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
         # Second is schema name
         schema_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next,  # type: ignore
         ).items["Name"]
         # Third is table name
         table_name: str = cast(
-            IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next  # type: ignore
+            IdentifierAccessor,
+            data_access_func_detail.identifier_accessor.next.next,  # type: ignore
         ).items["Name"]
 
         qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}"
@@ -768,10 +769,13 @@ class NativeQueryLineage(AbstractLineage):
         ):  # database name is explicitly set
             return database
 
-        return get_next_item(  # database name is set in Name argument
-            data_access_tokens, "Name"
-        ) or get_next_item(  # If both above arguments are not available, then try Catalog
-            data_access_tokens, "Catalog"
+        return (
+            get_next_item(  # database name is set in Name argument
+                data_access_tokens, "Name"
+            )
+            or get_next_item(  # If both above arguments are not available, then try Catalog
+                data_access_tokens, "Catalog"
+            )
         )
 
     def create_lineage(
@@ -819,9 +823,7 @@ class NativeQueryLineage(AbstractLineage):
             values=tree_function.remove_whitespaces_from_list(
                 tree_function.token_values(flat_argument_list[1])
            ),
-        )[
-            0
-        ]  # Remove any whitespaces and double quotes character
+        )[0]  # Remove any whitespaces and double quotes character
 
         server = tree_function.strip_char_from_list([data_access_tokens[2]])[0]
 
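Note: the get_db_name change above only re-wraps an existing fallback: prefer the token following "Name", otherwise try the token following "Catalog". A sketch using a stand-in get_next_item; the real helper lives alongside pattern_handler.py and may differ:

from typing import List, Optional

def get_next_item(tokens: List[str], item: str) -> Optional[str]:
    # Stand-in implementation for illustration only.
    try:
        return tokens[tokens.index(item) + 1]
    except (ValueError, IndexError):
        return None

data_access_tokens = ["Sql.Database", "server.example.net", "Catalog", "ANALYTICS_DB"]  # made-up tokens
database = get_next_item(data_access_tokens, "Name") or get_next_item(data_access_tokens, "Catalog")
print(database)  # -> ANALYTICS_DB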
datahub/ingestion/source/powerbi/m_query/resolver.py
CHANGED
@@ -188,9 +188,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
         # - The inner function Table.TransformColumnTypes takes #"Removed Columns1"
         # (a table reference) as its first argument
         # - Its result is then passed as the first argument to Table.SplitColumn
-        second_invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(first_argument)
+        second_invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(first_argument)
+        )
         if second_invoke_expression:
             # 1. The First argument is function call
             # 2. That function's first argument references next table variable
@@ -304,14 +304,14 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             logger.debug(v_statement.pretty())
             return None
 
-        invoke_expression: Optional[
-            Tree
-        ] = tree_function.first_invoke_expression_func(rh_tree)
+        invoke_expression: Optional[Tree] = (
+            tree_function.first_invoke_expression_func(rh_tree)
+        )
 
         if invoke_expression is not None:
-            result: Union[
-                DataAccessFunctionDetail, List[str], None
-            ] = self._process_invoke_expression(invoke_expression)
+            result: Union[DataAccessFunctionDetail, List[str], None] = (
+                self._process_invoke_expression(invoke_expression)
+            )
             if result is None:
                 return None  # No need to process some un-expected grammar found while processing invoke_expression
             if isinstance(result, DataAccessFunctionDetail):
@@ -368,9 +368,9 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
             return lineage
 
         # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail
-        table_links: List[
-            DataAccessFunctionDetail
-        ] = self.create_data_access_functional_detail(output_variable)
+        table_links: List[DataAccessFunctionDetail] = (
+            self.create_data_access_functional_detail(output_variable)
+        )
 
         # Each item is data-access function
         for f_detail in table_links:
@@ -390,7 +390,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC):
 
             # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it
             # & also pass additional information that will be need to generate lineage
-            pattern_handler:
+            pattern_handler: AbstractLineage = supported_resolver.handler()(
                 ctx=ctx,
                 table=self.table,
                 config=config,
datahub/ingestion/source/powerbi/powerbi.py
CHANGED
@@ -945,9 +945,9 @@ class Mapper:
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
         # Lets convert dashboard to datahub dashboard
-        dashboard_mcps: List[
-            MetadataChangeProposalWrapper
-        ] = self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        dashboard_mcps: List[MetadataChangeProposalWrapper] = (
+            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+        )
 
         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
@@ -1472,9 +1472,9 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
     def _get_dashboard_patch_work_unit(
         self, work_unit: MetadataWorkUnit
     ) -> Optional[MetadataWorkUnit]:
-        dashboard_info_aspect: Optional[
-            DashboardInfoClass
-        ] = work_unit.get_aspect_of_type(DashboardInfoClass)
+        dashboard_info_aspect: Optional[DashboardInfoClass] = (
+            work_unit.get_aspect_of_type(DashboardInfoClass)
+        )
 
         if dashboard_info_aspect and self.source_config.patch_metadata:
             return convert_dashboard_info_to_patch(
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py
CHANGED
@@ -425,9 +425,9 @@ class DataResolverBase(ABC):
 
             response.raise_for_status()
 
-            assert (
-                Constant.VALUE in response.json()
-            ), "'value' key is not present in paginated response"
+            assert Constant.VALUE in response.json(), (
+                "'value' key is not present in paginated response"
+            )
 
             if not response.json()[Constant.VALUE]:  # if it is an empty list then break
                 break
@@ -447,13 +447,13 @@ class DataResolverBase(ABC):
         if raw_app is None:
             return None
 
-        assert (
-            Constant.ID in raw_app
-        ), f"{Constant.ID} is required field not present in server response"
+        assert Constant.ID in raw_app, (
+            f"{Constant.ID} is required field not present in server response"
+        )
 
-        assert (
-            Constant.NAME in raw_app
-        ), f"{Constant.NAME} is required field not present in server response"
+        assert Constant.NAME in raw_app, (
+            f"{Constant.NAME} is required field not present in server response"
+        )
 
         return App(
             id=raw_app[Constant.ID],
datahub/ingestion/source/qlik_sense/qlik_api.py
CHANGED
@@ -156,7 +156,7 @@ class QlikAPI:
                 )
                 if chart:
                     if not chart.title:
-                        chart.title = f"Object {i+1} of Sheet '{sheet.title}'"
+                        chart.title = f"Object {i + 1} of Sheet '{sheet.title}'"
                     sheet.charts.append(chart)
             websocket_connection.handle.pop()
         return sheet
datahub/ingestion/source/redshift/config.py
CHANGED
@@ -178,9 +178,9 @@ class RedshiftConfig(
     @root_validator(pre=True)
     def check_email_is_set_on_usage(cls, values):
         if values.get("include_usage_statistics"):
-            assert (
-                "email_domain" in values and values["email_domain"]
-            ), "email_domain needs to be set if usage is enabled"
+            assert "email_domain" in values and values["email_domain"], (
+                "email_domain needs to be set if usage is enabled"
+            )
         return values
 
     @root_validator(skip_on_failure=True)
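Note: the RedshiftConfig hunk above is again only an assert re-wrap; the surrounding pattern is a pre-validation root validator that checks one field only when another is enabled. A minimal sketch, assuming pydantic v1-style validators as used in this file; the model itself is made up:

from pydantic import BaseModel, root_validator

class UsageConfigSketch(BaseModel):
    include_usage_statistics: bool = False
    email_domain: str = ""

    @root_validator(pre=True)
    def check_email_is_set_on_usage(cls, values):
        # Mirrors the check in RedshiftConfig: usage extraction needs email_domain.
        if values.get("include_usage_statistics"):
            assert "email_domain" in values and values["email_domain"], (
                "email_domain needs to be set if usage is enabled"
            )
        return values

UsageConfigSketch(include_usage_statistics=False)  # passes; enabling usage without email_domain would raise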
datahub/ingestion/source/redshift/redshift.py
CHANGED
@@ -305,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
            test_report.capability_report = {}
            try:
                RedshiftDataDictionary.get_schemas(connection, database=config.database)
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=True)
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=True)
+                )
            except Exception as e:
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=False, failure_reason=str(e))
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=False, failure_reason=str(e))
+                )
 
        except Exception as e:
            test_report.basic_connectivity = CapabilityReport(
@@ -947,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
    def get_all_tables(
        self,
    ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
-        all_tables: Dict[
-            str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]
-        ] = defaultdict(dict)
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+            defaultdict(dict)
+        )
        for db in set().union(self.db_tables, self.db_views):
            tables = self.db_tables.get(db, {})
            views = self.db_views.get(db, {})
@@ -967,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
    ) -> Iterable[MetadataWorkUnit]:
        with PerfTimer() as timer:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
            if self.config.enable_stateful_usage_ingestion:
                redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                    source=self,
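Note: get_all_tables above initializes a database -> schema -> tables mapping as defaultdict(dict); the re-wrap does not change behavior. A small sketch of why defaultdict(dict) is convenient here, with made-up table names standing in for RedshiftTable/RedshiftView objects:

from collections import defaultdict
from typing import Dict, List

# db -> schema -> list of table names.
all_tables: Dict[str, Dict[str, List[str]]] = defaultdict(dict)

# Missing database keys are created on first access, so per-database results merge in one line each.
all_tables["dev"]["public"] = ["orders", "customers"]
all_tables["analytics"]["reporting"] = ["daily_sales"]
print(dict(all_tables))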
datahub/ingestion/source/redshift/usage.py
CHANGED
@@ -199,10 +199,10 @@ class RedshiftUsageExtractor:
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
             database=self.config.database,
         )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
         )
 
         aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
             start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
         )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection, all_tables=all_tables
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection, all_tables=all_tables
+            )
         )
 
         # Generate operation aspect work units from the access events
datahub/ingestion/source/s3/datalake_profiler_config.py
CHANGED
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
                 if field_level_metric.startswith("include_field_"):
                     values.setdefault(field_level_metric, False)
 
-            assert (
-                max_num_fields_to_profile is None
-            ), f"{max_num_fields_to_profile_key} should be set to None"
+            assert max_num_fields_to_profile is None, (
+                f"{max_num_fields_to_profile_key} should be set to None"
+            )
 
         return values
datahub/ingestion/source/s3/source.py
CHANGED
@@ -1124,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp
 
-            for
+            for _, table_data in table_dict.items():
                 yield from self.ingest_table(table_data, path_spec)
 
         if not self.source_config.is_profiling_enabled():
datahub/ingestion/source/salesforce.py
CHANGED
@@ -236,12 +236,12 @@ class SalesforceSource(Source):
         try:
             if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
                 logger.debug("Access Token Provided in Config")
-                assert (
-                    self.config.access_token is not None
-                ), "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
-                assert (
-                    self.config.instance_url is not None
-                ), "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                assert self.config.access_token is not None, (
+                    "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+                )
+                assert self.config.instance_url is not None, (
+                    "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                )
 
                 self.sf = Salesforce(
                     instance_url=self.config.instance_url,
@@ -250,15 +250,15 @@ class SalesforceSource(Source):
                 )
             elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
                 logger.debug("Username/Password Provided in Config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.password is not None
-                ), "Config password is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.security_token is not None
-                ), "Config security_token is required for USERNAME_PASSWORD auth"
+                assert self.config.username is not None, (
+                    "Config username is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.password is not None, (
+                    "Config password is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.security_token is not None, (
+                    "Config security_token is required for USERNAME_PASSWORD auth"
+                )
 
                 self.sf = Salesforce(
                     username=self.config.username,
@@ -269,15 +269,15 @@ class SalesforceSource(Source):
 
             elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
                 logger.debug("Json Web Token provided in the config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.consumer_key is not None
-                ), "Config consumer_key is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.private_key is not None
-                ), "Config private_key is required for JSON_WEB_TOKEN auth"
+                assert self.config.username is not None, (
+                    "Config username is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.consumer_key is not None, (
+                    "Config consumer_key is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.private_key is not None, (
+                    "Config private_key is required for JSON_WEB_TOKEN auth"
+                )
 
                 self.sf = Salesforce(
                     username=self.config.username,
@@ -439,7 +439,8 @@ class SalesforceSource(Source):
         dataPlatformInstance = DataPlatformInstanceClass(
             builder.make_data_platform_urn(self.platform),
             instance=builder.make_dataplatform_instance_urn(
-                self.platform, self.config.platform_instance  # type:ignore
+                self.platform,
+                self.config.platform_instance,  # type:ignore
             ),
         )
 
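Note: the last Salesforce hunk above only splits the arguments of builder.make_dataplatform_instance_urn across two lines. A minimal sketch of how that builder is typically used to build a DataPlatformInstance aspect, assuming the datahub.emitter.mce_builder helpers and the generated schema classes shipped in this wheel; the instance name is a placeholder:

import datahub.emitter.mce_builder as builder
from datahub.metadata.schema_classes import DataPlatformInstanceClass

platform = "salesforce"
platform_instance = "my-salesforce-org"  # placeholder instance name

data_platform_instance = DataPlatformInstanceClass(
    platform=builder.make_data_platform_urn(platform),
    instance=builder.make_dataplatform_instance_urn(platform, platform_instance),
)
print(data_platform_instance.instance)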
datahub/ingestion/source/schema/json_schema.py
CHANGED
@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"
 
         if os.path.isdir(self.config.path):
-            for root,
+            for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
                         yield from self._load_one_file(