acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/METADATA +2513 -2521
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/RECORD +168 -168
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +4 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/gc/datahub_gc.py +1 -0
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +17 -5
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +0 -1
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +245 -101
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc6.dist-info → acryl_datahub-0.15.0.2rc8.dist-info}/top_level.txt +0 -0
@@ -425,9 +425,9 @@ class DataResolverBase(ABC):

            response.raise_for_status()

-            assert (
-                Constant.VALUE in response.json()
-            ), "'value' key is not present in paginated response"
+            assert Constant.VALUE in response.json(), (
+                "'value' key is not present in paginated response"
+            )

            if not response.json()[Constant.VALUE]: # if it is an empty list then break
                break
@@ -447,13 +447,13 @@ class DataResolverBase(ABC):
        if raw_app is None:
            return None

-        assert (
-            Constant.ID in raw_app
-        ), f"{Constant.ID} is required field not present in server response"
+        assert Constant.ID in raw_app, (
+            f"{Constant.ID} is required field not present in server response"
+        )

-        assert (
-            Constant.NAME in raw_app
-        ), f"{Constant.NAME} is required field not present in server response"
+        assert Constant.NAME in raw_app, (
+            f"{Constant.NAME} is required field not present in server response"
+        )

        return App(
            id=raw_app[Constant.ID],
@@ -96,7 +96,7 @@ class PowerBiAPI:
            url: str = e.request.url if e.request else "URL not available"
            self.reporter.warning(
                title="Metadata API Timeout",
-                message=
+                message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
                context=f"url={url}",
            )

@@ -173,7 +173,7 @@ class PowerBiAPI:
                entity=entity_name,
                entity_id=entity_id,
            )
-        except:
+        except Exception:
            e = self.log_http_error(
                message=f"Unable to fetch users for {entity_name}({entity_id})."
            )
@@ -210,7 +210,7 @@ class PowerBiAPI:
                    message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                    context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
                )
-        except:
+        except Exception:
            self.log_http_error(
                message=f"Unable to fetch reports for workspace {workspace.name}"
            )
@@ -260,7 +260,7 @@ class PowerBiAPI:

            groups = self._get_resolver().get_groups(filter_=filter_)

-        except:
+        except Exception:
            self.log_http_error(message="Unable to fetch list of workspaces")
            # raise # we want this exception to bubble up

@@ -292,7 +292,7 @@ class PowerBiAPI:
            modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
                self.__config.modified_since
            )
-        except:
+        except Exception:
            self.log_http_error(message="Unable to fetch list of modified workspaces.")

        return modified_workspace_ids
@@ -303,8 +303,8 @@ class PowerBiAPI:
            scan_id = self.__admin_api_resolver.create_scan_job(
                workspace_ids=workspace_ids
            )
-        except:
-            e = self.log_http_error(message=
+        except Exception:
+            e = self.log_http_error(message="Unable to fetch get scan result.")
            if data_resolver.is_permission_error(cast(Exception, e)):
                logger.warning(
                    "Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "
@@ -156,7 +156,7 @@ class QlikAPI:
                )
                if chart:
                    if not chart.title:
-                        chart.title = f"Object {i+1} of Sheet '{sheet.title}'"
+                        chart.title = f"Object {i + 1} of Sheet '{sheet.title}'"
                    sheet.charts.append(chart)
            websocket_connection.handle.pop()
        return sheet
@@ -178,9 +178,9 @@ class RedshiftConfig(
    @root_validator(pre=True)
    def check_email_is_set_on_usage(cls, values):
        if values.get("include_usage_statistics"):
-            assert (
-                "email_domain" in values and values["email_domain"]
-            ), "email_domain needs to be set if usage is enabled"
+            assert "email_domain" in values and values["email_domain"], (
+                "email_domain needs to be set if usage is enabled"
+            )
        return values

    @root_validator(skip_on_failure=True)
@@ -305,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
            test_report.capability_report = {}
            try:
                RedshiftDataDictionary.get_schemas(connection, database=config.database)
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=True)
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=True)
+                )
            except Exception as e:
-                test_report.capability_report[
-                    SourceCapability.SCHEMA_METADATA
-                ] = CapabilityReport(capable=False, failure_reason=str(e))
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=False, failure_reason=str(e))
+                )

        except Exception as e:
            test_report.basic_connectivity = CapabilityReport(
@@ -947,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
    def get_all_tables(
        self,
    ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
-        all_tables: Dict[
-            str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]
-        ] = defaultdict(dict)
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+            defaultdict(dict)
+        )
        for db in set().union(self.db_tables, self.db_views):
            tables = self.db_tables.get(db, {})
            views = self.db_views.get(db, {})
@@ -967,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
    ) -> Iterable[MetadataWorkUnit]:
        with PerfTimer() as timer:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
            if self.config.enable_stateful_usage_ingestion:
                redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                    source=self,
@@ -199,10 +199,10 @@ class RedshiftUsageExtractor:
            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
            database=self.config.database,
        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection=self.connection, all_tables=all_tables
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
        )

        aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(
@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
            start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
            end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
        )
-        access_events_iterable: Iterable[
-            RedshiftAccessEvent
-        ] = self._gen_access_events_from_history_query(
-            query, connection, all_tables=all_tables
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection, all_tables=all_tables
+            )
        )

        # Generate operation aspect work units from the access events
@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
            if field_level_metric.startswith("include_field_"):
                values.setdefault(field_level_metric, False)

-        assert (
-            max_num_fields_to_profile is None
-        ), f"{max_num_fields_to_profile_key} should be set to None"
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

        return values
@@ -1124,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
                    table_data.table_path
                ].timestamp = table_data.timestamp

-            for
+            for _, table_data in table_dict.items():
                yield from self.ingest_table(table_data, path_spec)

        if not self.source_config.is_profiling_enabled():
@@ -236,12 +236,12 @@ class SalesforceSource(Source):
        try:
            if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
                logger.debug("Access Token Provided in Config")
-                assert (
-                    self.config.access_token is not None
-                ), "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
-                assert (
-                    self.config.instance_url is not None
-                ), "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                assert self.config.access_token is not None, (
+                    "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+                )
+                assert self.config.instance_url is not None, (
+                    "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                )

                self.sf = Salesforce(
                    instance_url=self.config.instance_url,
@@ -250,15 +250,15 @@ class SalesforceSource(Source):
                )
            elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
                logger.debug("Username/Password Provided in Config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.password is not None
-                ), "Config password is required for USERNAME_PASSWORD auth"
-                assert (
-                    self.config.security_token is not None
-                ), "Config security_token is required for USERNAME_PASSWORD auth"
+                assert self.config.username is not None, (
+                    "Config username is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.password is not None, (
+                    "Config password is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.security_token is not None, (
+                    "Config security_token is required for USERNAME_PASSWORD auth"
+                )

                self.sf = Salesforce(
                    username=self.config.username,
@@ -269,15 +269,15 @@ class SalesforceSource(Source):

            elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
                logger.debug("Json Web Token provided in the config")
-                assert (
-                    self.config.username is not None
-                ), "Config username is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.consumer_key is not None
-                ), "Config consumer_key is required for JSON_WEB_TOKEN auth"
-                assert (
-                    self.config.private_key is not None
-                ), "Config private_key is required for JSON_WEB_TOKEN auth"
+                assert self.config.username is not None, (
+                    "Config username is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.consumer_key is not None, (
+                    "Config consumer_key is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.private_key is not None, (
+                    "Config private_key is required for JSON_WEB_TOKEN auth"
+                )

                self.sf = Salesforce(
                    username=self.config.username,
@@ -439,7 +439,8 @@ class SalesforceSource(Source):
        dataPlatformInstance = DataPlatformInstanceClass(
            builder.make_data_platform_urn(self.platform),
            instance=builder.make_dataplatform_instance_urn(
-                self.platform,
+                self.platform,
+                self.config.platform_instance, # type:ignore
            ),
        )

@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
        browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

        if os.path.isdir(self.config.path):
-            for root,
+            for root, _, files in os.walk(self.config.path, topdown=False):
                for file_name in [f for f in files if f.endswith(".json")]:
                    try:
                        yield from self._load_one_file(
@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
            upstream_dataset_urns
            and dataset_urn not in self.dataset_upstream_urn_mapping
        ):
-            self.dataset_upstream_urn_mapping[
-                dataset_urn
-            ] = upstream_dataset_urns
+            self.dataset_upstream_urn_mapping[dataset_urn] = (
+                upstream_dataset_urns
+            )

        element_input_fields = [
            InputFieldClass(
@@ -126,9 +126,9 @@ class SigmaAPI:
            response.raise_for_status()
            response_dict = response.json()
            for workspace_dict in response_dict[Constant.ENTRIES]:
-                self.workspaces[
-                    workspace_dict[Constant.WORKSPACEID]
-                ] = Workspace.parse_obj(workspace_dict)
+                self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+                    Workspace.parse_obj(workspace_dict)
+                )
            if response_dict[Constant.NEXTPAGE]:
                url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
            else:
@@ -147,9 +147,9 @@ class SigmaAPI:
            response.raise_for_status()
            response_dict = response.json()
            for user_dict in response_dict[Constant.ENTRIES]:
-                users[
-                    user_dict[Constant.MEMBERID]
-                ] = f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                users[user_dict[Constant.MEMBERID]] = (
+                    f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+                )
            if response_dict[Constant.NEXTPAGE]:
                url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
            else:
@@ -327,10 +327,12 @@ class SigmaAPI:
            response.raise_for_status()
            for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
                if not element_dict.get(Constant.NAME):
-                    element_dict[Constant.NAME] = f"Element {i+1} of Page '{page.name}'"
-                element_dict[
-                    Constant.URL
-                ] = f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                    element_dict[Constant.NAME] = (
+                        f"Element {i + 1} of Page '{page.name}'"
+                    )
+                element_dict[Constant.URL] = (
+                    f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                )
                element = Element.parse_obj(element_dict)
                if (
                    self.config.extract_lineage
@@ -384,18 +384,20 @@ class SnowflakeV2Config(
            assert all(
                consumer.platform_instance != share_details.platform_instance
                for consumer in share_details.consumers
-            ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            ), (
+                "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            )

            databases_included_in_share.append(shared_db)
            databases_created_from_share.extend(share_details.consumers)

        for db_from_share in databases_created_from_share:
-            assert (
-                db_from_share not in databases_included_in_share
-            ), "Database included in a share can not be present as consumer in any share."
-            assert (
-                databases_created_from_share.count(db_from_share) == 1
-            ), "Same database can not be present as consumer in more than one share."
+            assert db_from_share not in databases_included_in_share, (
+                "Database included in a share can not be present as consumer in any share."
+            )
+            assert databases_created_from_share.count(db_from_share) == 1, (
+                "Same database can not be present as consumer in more than one share."
+            )

        return shares

@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
        if self.private_key is not None:
            pkey_bytes = self.private_key.replace("\\n", "\n").encode()
        else:
-            assert (
-                self.private_key_path
-            ), "missing required private key path to read key from"
+            assert self.private_key_path, (
+                "missing required private key path to read key from"
+            )
            with open(self.private_key_path, "rb") as key:
                pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
        return self.options

    def get_oauth_connection(self) -> NativeSnowflakeConnection:
-        assert (
-            self.oauth_config
-        ), "oauth_config should be provided if using oauth based authentication"
+        assert self.oauth_config, (
+            "oauth_config should be provided if using oauth based authentication"
+        )
        generator = OAuthTokenGenerator(
            client_id=self.oauth_config.client_id,
            authority_url=self.oauth_config.authority_url,
@@ -623,7 +623,7 @@ fingerprinted_queries as (
        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
        AND execution_status = 'SUCCESS'
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
    )
    , deduplicated_queries as (
        SELECT
@@ -651,7 +651,7 @@ fingerprinted_queries as (
    WHERE
        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
        AND query_id IN (
            SELECT query_id FROM deduplicated_queries
        )
@@ -142,9 +142,9 @@ class _SnowflakeTagCache:
        )

        # self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
-        self._table_tags: Dict[
-            str, Dict[str, Dict[str, List[SnowflakeTag]]]
-        ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        )

        # self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
        self._column_tags: Dict[
@@ -194,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
            config, self.data_dictionary, self.report
        )
        self.profiler: Optional[SnowflakeProfiler] = profiler
-        self.snowsight_url_builder: Optional[
-            SnowsightUrlBuilder
-        ] = snowsight_url_builder
+        self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
+            snowsight_url_builder
+        )

        # These are populated as side-effects of get_workunits_internal.
        self.databases: List[SnowflakeDatabase] = []
@@ -267,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
            )
            return None
        else:
-            ischema_databases: List[
-                SnowflakeDatabase
-            ] = self.get_databases_from_ischema(databases)
+            ischema_databases: List[SnowflakeDatabase] = (
+                self.get_databases_from_ischema(databases)
+            )

            if len(ischema_databases) == 0:
                self.structured_reporter.failure(
@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
        table_name: Optional[str],
    ) -> List[SnowflakeTag]:
        if db_name not in self.tag_cache:
-            self.tag_cache[
-                db_name
-            ] = self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+            self.tag_cache[db_name] = (
+                self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+            )

        if domain == SnowflakeObjectDomain.DATABASE:
            return self.tag_cache[db_name].get_database_tags(db_name)
@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
        temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
        if self.config.extract_tags == TagOption.without_lineage:
            if db_name not in self.tag_cache:
-                self.tag_cache[
-                    db_name
-                ] = self.data_dictionary.get_tags_for_database_without_propagation(
-                    db_name
+                self.tag_cache[db_name] = (
+                    self.data_dictionary.get_tags_for_database_without_propagation(
+                        db_name
+                    )
                )
            temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
                table_name, schema_name, db_name
@@ -549,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
        ):
            # NOTE: Generated emails may be incorrect, as email may be different than
            # username@email_domain
-            event_dict[
-                "EMAIL"
-            ] = f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+            event_dict["EMAIL"] = (
+                f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+            )

        if not event_dict["EMAIL"]:
            self.report.rows_missing_email += 1
@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
class SnowflakeStructuredReportMixin(abc.ABC):
    @property
    @abc.abstractmethod
-    def structured_reporter(self) -> SourceReport:
-        ...
+    def structured_reporter(self) -> SourceReport: ...


class SnowsightUrlBuilder:
@@ -211,9 +211,9 @@ class SnowflakeV2Source(

        self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
        if self.config.include_usage_stats or self.config.include_operational_stats:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
            if self.config.enable_stateful_usage_ingestion:
                redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                    source=self,
@@ -296,7 +296,16 @@ class SnowflakeV2Source(

        _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
        privileges: List[SnowflakePrivilege] = []
-        capabilities: List[SourceCapability] = [
+        capabilities: List[SourceCapability] = [
+            c.capability
+            for c in SnowflakeV2Source.get_capabilities() # type: ignore
+            if c.capability
+            not in (
+                SourceCapability.PLATFORM_INSTANCE,
+                SourceCapability.DOMAINS,
+                SourceCapability.DELETION_DETECTION,
+            )
+        ]

        cur = conn.query("select current_role()")
        current_role = [row["CURRENT_ROLE()"] for row in cur][0]
@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
        return "\n".join([r for r in res])

    @typing.no_type_check
-    def _get_column_type(
-        self, type_: Union[str, Dict[str, Any]]
-    ) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
        """Derives the data type of the Athena column.

        This method is overwritten to extend the behavior of PyAthena.
|