acryl-datahub 0.15.0.2rc7__py3-none-any.whl → 0.15.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/METADATA +2461 -2463
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/RECORD +161 -161
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/cli/cli_utils.py +1 -1
- datahub/cli/delete_cli.py +16 -2
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +3 -3
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +4 -6
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +11 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +6 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +15 -9
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -9
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -4
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/config.py +4 -0
- datahub/ingestion/source/fivetran/fivetran.py +15 -5
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +3 -3
- datahub/ingestion/source/gcs/gcs_source.py +5 -3
- datahub/ingestion/source/ge_data_profiler.py +4 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +3 -3
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +2 -3
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +3 -3
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/mlflow.py +4 -4
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -26
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/query.py +77 -47
- datahub/ingestion/source/redshift/redshift.py +12 -12
- datahub/ingestion/source/redshift/usage.py +8 -8
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +9 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_tag.py +7 -7
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +13 -4
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/teradata.py +16 -3
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/tableau/tableau.py +48 -49
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +3 -3
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +6 -2
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +16 -16
- datahub/sql_parsing/sqlglot_lineage.py +5 -4
- datahub/sql_parsing/sqlglot_utils.py +3 -2
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +10 -10
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc7.dist-info → acryl_datahub-0.15.0.3.dist-info}/top_level.txt +0 -0
@@ -797,61 +797,91 @@ class RedshiftServerlessQuery(RedshiftCommonQuery):
         db_name: str, start_time: datetime, end_time: datetime
     ) -> str:
         return """
-
-            distinct cluster,
-            target_schema,
-            target_table,
-            username,
-            source_schema,
-            source_table,
-            query_text AS ddl,
-            start_time AS timestamp
-        FROM
-            (
-            SELECT
-                sti.schema AS target_schema,
-                sti.table AS target_table,
-                sti.database AS cluster,
-                qi.table_id AS target_table_id,
-                qi.query_id AS query_id,
-                qi.start_time AS start_time
-            FROM
-                SYS_QUERY_DETAIL qi
-            JOIN
-                SVV_TABLE_INFO sti on sti.table_id = qi.table_id
-            WHERE
-                start_time >= '{start_time}' and
-                start_time < '{end_time}' and
-                cluster = '{db_name}' and
-                step_name = 'insert'
-            ) AS target_tables
-        JOIN
-            (
+        WITH queries AS (
             SELECT
-                sti.
-                sti.
-
-                qs.
-
-
+                sti.database as cluster,
+                sti.schema AS "schema",
+                sti.table AS "table",
+                qs.table_id AS table_id,
+                qs.query_id as query_id,
+                qs.step_name as step_name,
+                sui.user_name as username,
+                source,
+                MIN(qs.start_time) as "timestamp" -- multiple duplicate records with start_time increasing slightly by miliseconds
             FROM
                 SYS_QUERY_DETAIL qs
             JOIN
                 SVV_TABLE_INFO sti ON sti.table_id = qs.table_id
             LEFT JOIN
-                SYS_QUERY_TEXT qt ON qt.query_id = qs.query_id
-            LEFT JOIN
                 SVV_USER_INFO sui ON qs.user_id = sui.user_id
             WHERE
-
-                qs.
-
-
-
-
-
-
+                cluster = '{db_name}' AND
+                qs.user_id <> 1 AND -- this is user 'rdsdb'
+                qs.start_time >= '{start_time}' AND
+                qs.start_time < '{end_time}'
+            GROUP BY cluster, "schema", "table", qs.table_id, query_id, step_name, username, source -- to be sure we are not making duplicates ourselves the list of group by must match whatever we use in "group by" and "where" of subsequent queries ("cluster" is already set to single value in this query)
+        ),
+        unique_query_text AS (
+            SELECT
+                query_id,
+                sequence,
+                text
+            FROM (
+                SELECT
+                    query_id,
+                    "sequence",
+                    text,
+                    ROW_NUMBER() OVER (
+                        PARTITION BY query_id, sequence
+                    ) as rn
+                FROM SYS_QUERY_TEXT
+            )
+            WHERE rn = 1
+        ),
+        scan_queries AS (
+            SELECT
+                "schema" as source_schema,
+                "table" as source_table,
+                table_id as source_table_id,
+                queries.query_id as query_id,
+                username,
+                LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+            FROM
+                "queries" LEFT JOIN
+                unique_query_text qt ON qt.query_id = queries.query_id
+            WHERE
+                source = 'Redshift(local)' AND
+                step_name = 'scan' AND
+                qt.sequence < 16 -- truncating query to not exceed Redshift limit on LISTAGG function (each sequence has at most 4k characters, limit is 64k, divided by 4k gives 16, starts count from 0)
+            GROUP BY source_schema, source_table, source_table_id, queries.query_id, username
+        ),
+        insert_queries AS (
+            SELECT
+                "schema" as target_schema,
+                "table" as target_table,
+                table_id as target_table_id,
+                query_id,
+                cluster,
+                min("timestamp") as "timestamp"
+            FROM
+                queries
+            WHERE
+                step_name = 'insert'
+            GROUP BY cluster, target_schema, target_table, target_table_id, query_id
+        )
+        SELECT
+            cluster,
+            target_schema,
+            target_table,
+            username,
+            source_schema,
+            source_table,
+            query_text AS ddl,
+            "timestamp"
+        FROM scan_queries
+        JOIN insert_queries on insert_queries.query_id = scan_queries.query_id
+        WHERE source_table_id <> target_table_id
+        ORDER BY cluster, target_schema, target_table, "timestamp" ASC;
         """.format(
             # We need the original database name for filtering
             db_name=db_name,
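The rewritten serverless lineage query above keeps the same three str.format() placeholders ({db_name}, {start_time}, {end_time}) but moves the work into CTEs: "queries" collapses duplicate SYS_QUERY_DETAIL rows with GROUP BY/MIN, "unique_query_text" deduplicates SYS_QUERY_TEXT rows with ROW_NUMBER, and "scan_queries" reassembles statement text with LISTAGG capped at sequence < 16 so the 64k LISTAGG limit (16 chunks of at most 4k characters) is not exceeded. A minimal sketch of how such a template might be rendered is below; the helper name and the datetime format string are assumptions for illustration, not the package's actual API.

```python
from datetime import datetime

# Assumed value; the real REDSHIFT_DATETIME_FORMAT constant lives in the
# redshift source module and may differ.
REDSHIFT_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def render_lineage_query(template: str, db_name: str,
                         start_time: datetime, end_time: datetime) -> str:
    # Hypothetical helper: fills the three placeholders visible in the diff.
    return template.format(
        db_name=db_name,
        start_time=start_time.strftime(REDSHIFT_DATETIME_FORMAT),
        end_time=end_time.strftime(REDSHIFT_DATETIME_FORMAT),
    )
```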
@@ -305,13 +305,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             test_report.capability_report = {}
             try:
                 RedshiftDataDictionary.get_schemas(connection, database=config.database)
-                test_report.capability_report[
-
-
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=True)
+                )
             except Exception as e:
-                test_report.capability_report[
-
-
+                test_report.capability_report[SourceCapability.SCHEMA_METADATA] = (
+                    CapabilityReport(capable=False, failure_reason=str(e))
+                )

         except Exception as e:
             test_report.basic_connectivity = CapabilityReport(

@@ -947,9 +947,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     def get_all_tables(
         self,
     ) -> Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]]:
-        all_tables: Dict[
-
-
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]] = (
+            defaultdict(dict)
+        )
         for db in set().union(self.db_tables, self.db_views):
             tables = self.db_tables.get(db, {})
             views = self.db_views.get(db, {})

@@ -967,9 +967,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
     ) -> Iterable[MetadataWorkUnit]:
         with PerfTimer() as timer:
-            redundant_usage_run_skip_handler: Optional[
-
-
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,

@@ -199,10 +199,10 @@ class RedshiftUsageExtractor:
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
             database=self.config.database,
         )
-        access_events_iterable: Iterable[
-
-
-
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection=self.connection, all_tables=all_tables
+            )
         )

         aggregated_events: AggregatedAccessEvents = self._aggregate_access_events(

@@ -225,10 +225,10 @@ class RedshiftUsageExtractor:
             start_time=self.start_time.strftime(REDSHIFT_DATETIME_FORMAT),
             end_time=self.end_time.strftime(REDSHIFT_DATETIME_FORMAT),
         )
-        access_events_iterable: Iterable[
-
-
-
+        access_events_iterable: Iterable[RedshiftAccessEvent] = (
+            self._gen_access_events_from_history_query(
+                query, connection, all_tables=all_tables
+            )
         )

         # Generate operation aspect work units from the access events

@@ -85,8 +85,8 @@ class DataLakeProfilerConfig(ConfigModel):
             if field_level_metric.startswith("include_field_"):
                 values.setdefault(field_level_metric, False)

-        assert (
-
-        )
+        assert max_num_fields_to_profile is None, (
+            f"{max_num_fields_to_profile_key} should be set to None"
+        )

        return values

@@ -1124,7 +1124,7 @@ class S3Source(StatefulIngestionSourceBase):
                     table_data.table_path
                 ].timestamp = table_data.timestamp

-            for
+            for _, table_data in table_dict.items():
                 yield from self.ingest_table(table_data, path_spec)

         if not self.source_config.is_profiling_enabled():

@@ -236,12 +236,12 @@ class SalesforceSource(Source):
         try:
             if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN:
                 logger.debug("Access Token Provided in Config")
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.access_token is not None, (
+                    "Config access_token is required for DIRECT_ACCESS_TOKEN auth"
+                )
+                assert self.config.instance_url is not None, (
+                    "Config instance_url is required for DIRECT_ACCESS_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     instance_url=self.config.instance_url,

@@ -250,15 +250,15 @@ class SalesforceSource(Source):
                 )
             elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD:
                 logger.debug("Username/Password Provided in Config")
-                assert (
-
-                )
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.username is not None, (
+                    "Config username is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.password is not None, (
+                    "Config password is required for USERNAME_PASSWORD auth"
+                )
+                assert self.config.security_token is not None, (
+                    "Config security_token is required for USERNAME_PASSWORD auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,

@@ -269,15 +269,15 @@ class SalesforceSource(Source):

             elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN:
                 logger.debug("Json Web Token provided in the config")
-                assert (
-
-                )
-                assert (
-
-                )
-                assert (
-
-                )
+                assert self.config.username is not None, (
+                    "Config username is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.consumer_key is not None, (
+                    "Config consumer_key is required for JSON_WEB_TOKEN auth"
+                )
+                assert self.config.private_key is not None, (
+                    "Config private_key is required for JSON_WEB_TOKEN auth"
+                )

                 self.sf = Salesforce(
                     username=self.config.username,
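The three Salesforce hunks above replace bare assert (...) checks with asserts that name the missing field, so each auth mode now fails with an explicit message: DIRECT_ACCESS_TOKEN needs access_token and instance_url, USERNAME_PASSWORD needs username, password, and security_token, and JSON_WEB_TOKEN needs username, consumer_key, and private_key. The sketch below restates that mapping; the dictionary layout and helper are illustrative only, not the source's actual config schema.

```python
from typing import Dict, List, Tuple

# Field names come from the assert messages in the diff; everything else here
# (the mapping structure, the helper) is illustrative.
REQUIRED_FIELDS_BY_AUTH: Dict[str, Tuple[str, ...]] = {
    "DIRECT_ACCESS_TOKEN": ("access_token", "instance_url"),
    "USERNAME_PASSWORD": ("username", "password", "security_token"),
    "JSON_WEB_TOKEN": ("username", "consumer_key", "private_key"),
}


def missing_fields(auth_type: str, recipe_config: dict) -> List[str]:
    """Return the config keys the chosen auth flow needs but that are unset."""
    return [
        field
        for field in REQUIRED_FIELDS_BY_AUTH[auth_type]
        if not recipe_config.get(field)
    ]


# Example: a USERNAME_PASSWORD recipe that is missing its security token.
print(missing_fields("USERNAME_PASSWORD", {"username": "u", "password": "p"}))
# ['security_token']
```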
@@ -439,7 +439,8 @@ class SalesforceSource(Source):
         dataPlatformInstance = DataPlatformInstanceClass(
             builder.make_data_platform_urn(self.platform),
             instance=builder.make_dataplatform_instance_urn(
-                self.platform,
+                self.platform,
+                self.config.platform_instance,  # type:ignore
             ),
         )

@@ -354,7 +354,7 @@ class JsonSchemaSource(StatefulIngestionSourceBase):
         browse_prefix = f"/{self.config.env.lower()}/{self.config.platform}/{self.config.platform_instance}"

         if os.path.isdir(self.config.path):
-            for root,
+            for root, _, files in os.walk(self.config.path, topdown=False):
                 for file_name in [f for f in files if f.endswith(".json")]:
                     try:
                         yield from self._load_one_file(

@@ -477,9 +477,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             upstream_dataset_urns
             and dataset_urn not in self.dataset_upstream_urn_mapping
         ):
-            self.dataset_upstream_urn_mapping[
-
-
+            self.dataset_upstream_urn_mapping[dataset_urn] = (
+                upstream_dataset_urns
+            )

         element_input_fields = [
             InputFieldClass(

@@ -126,9 +126,9 @@ class SigmaAPI:
         response.raise_for_status()
         response_dict = response.json()
         for workspace_dict in response_dict[Constant.ENTRIES]:
-            self.workspaces[
-                workspace_dict
-
+            self.workspaces[workspace_dict[Constant.WORKSPACEID]] = (
+                Workspace.parse_obj(workspace_dict)
+            )
         if response_dict[Constant.NEXTPAGE]:
             url = f"{workspace_url}&page={response_dict[Constant.NEXTPAGE]}"
         else:

@@ -147,9 +147,9 @@ class SigmaAPI:
         response.raise_for_status()
         response_dict = response.json()
         for user_dict in response_dict[Constant.ENTRIES]:
-            users[
-                user_dict[Constant.
-
+            users[user_dict[Constant.MEMBERID]] = (
+                f"{user_dict[Constant.FIRSTNAME]}_{user_dict[Constant.LASTNAME]}"
+            )
         if response_dict[Constant.NEXTPAGE]:
             url = f"{members_url}&page={response_dict[Constant.NEXTPAGE]}"
         else:
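Both SigmaAPI hunks above sit inside the same pagination loop: each response's Constant.ENTRIES list is consumed, and if Constant.NEXTPAGE is set the next request appends &page=<token> to the base URL. A generic, hedged sketch of that fetch-and-follow pattern follows; the session object, key names ("entries", "nextPage"), and URL shape stand in for the real Constant values and client plumbing.

```python
from typing import Any, Dict, Iterator

import requests


def iter_entries(session: requests.Session, base_url: str) -> Iterator[Dict[str, Any]]:
    """Yield every entry across pages; the key names are assumed stand-ins for
    Constant.ENTRIES / Constant.NEXTPAGE in the real client."""
    url = base_url
    while True:
        response = session.get(url)
        response.raise_for_status()
        payload = response.json()
        yield from payload.get("entries", [])
        next_page = payload.get("nextPage")
        if not next_page:
            break
        # Mirrors the f"{workspace_url}&page={...}" construction in the diff.
        url = f"{base_url}&page={next_page}"
```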
@@ -327,10 +327,12 @@ class SigmaAPI:
             response.raise_for_status()
             for i, element_dict in enumerate(response.json()[Constant.ENTRIES]):
                 if not element_dict.get(Constant.NAME):
-                    element_dict[Constant.NAME] =
-
-
-
+                    element_dict[Constant.NAME] = (
+                        f"Element {i + 1} of Page '{page.name}'"
+                    )
+                element_dict[Constant.URL] = (
+                    f"{workbook.url}?:nodeId={element_dict[Constant.ELEMENTID]}&:fullScreen=true"
+                )
                 element = Element.parse_obj(element_dict)
                 if (
                     self.config.extract_lineage

@@ -384,18 +384,20 @@ class SnowflakeV2Config(
             assert all(
                 consumer.platform_instance != share_details.platform_instance
                 for consumer in share_details.consumers
-            ),
+            ), (
+                "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake."
+            )

             databases_included_in_share.append(shared_db)
             databases_created_from_share.extend(share_details.consumers)

         for db_from_share in databases_created_from_share:
-            assert (
-
-            )
-            assert (
-
-            )
+            assert db_from_share not in databases_included_in_share, (
+                "Database included in a share can not be present as consumer in any share."
+            )
+            assert databases_created_from_share.count(db_from_share) == 1, (
+                "Same database can not be present as consumer in more than one share."
+            )

         return shares

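The SnowflakeV2Config hunk attaches explicit messages to three share-validation asserts: a share's own platform_instance must differ from every consumer's, a database included in a share cannot also appear as a consumer, and a database may be created from at most one share. A self-contained sketch of those invariants on a toy structure follows; the dataclasses are illustrative stand-ins, not the package's actual share config classes.

```python
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ConsumerDb:  # illustrative stand-in
    database: str
    platform_instance: str


@dataclass
class Share:  # illustrative stand-in for one share entry
    platform_instance: str
    consumers: List[ConsumerDb] = field(default_factory=list)


def validate_shares(shares: Dict[str, Share]) -> None:
    included: List[str] = []   # databases exposed through a share
    consumers: List[str] = []  # databases created from a share
    for shared_db, share in shares.items():
        # 1. Self-sharing is not supported.
        assert all(
            c.platform_instance != share.platform_instance for c in share.consumers
        ), "Share's platform_instance can not be same as consumer's platform instance."
        included.append(shared_db)
        consumers.extend(c.database for c in share.consumers)
    for db in consumers:
        # 2. A shared database cannot also be a consumer of a share.
        assert db not in included, (
            "Database included in a share can not be present as consumer in any share."
        )
        # 3. A database can be a consumer of at most one share.
        assert consumers.count(db) == 1, (
            "Same database can not be present as consumer in more than one share."
        )
```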
@@ -250,9 +250,9 @@ class SnowflakeConnectionConfig(ConfigModel):
         if self.private_key is not None:
             pkey_bytes = self.private_key.replace("\\n", "\n").encode()
         else:
-            assert (
-
-            )
+            assert self.private_key_path, (
+                "missing required private key path to read key from"
+            )
             with open(self.private_key_path, "rb") as key:
                 pkey_bytes = key.read()

@@ -284,9 +284,9 @@ class SnowflakeConnectionConfig(ConfigModel):
             return self.options

     def get_oauth_connection(self) -> NativeSnowflakeConnection:
-        assert (
-
-        )
+        assert self.oauth_config, (
+            "oauth_config should be provided if using oauth based authentication"
+        )
         generator = OAuthTokenGenerator(
             client_id=self.oauth_config.client_id,
             authority_url=self.oauth_config.authority_url,

@@ -623,7 +623,7 @@ fingerprinted_queries as (
         query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3)
         AND execution_status = 'SUCCESS'
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
     )
 , deduplicated_queries as (
     SELECT

@@ -651,7 +651,7 @@ fingerprinted_queries as (
     WHERE
         query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
         AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-        AND {users_filter or
+        AND {users_filter or "TRUE"}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
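Both snowflake_queries.py hunks render the user filter as AND {users_filter or "TRUE"}, so when no user filter is configured the clause degenerates to AND TRUE and the generated SQL stays syntactically valid. A tiny sketch of that fallback is below; the surrounding query text is trimmed and the function and variable names are illustrative.

```python
def render_time_and_user_filter(users_filter: str, start_millis: int, end_millis: int) -> str:
    # Mirrors the `{users_filter or "TRUE"}` fallback from the diff: an empty
    # filter string is falsy, so the clause becomes "AND TRUE".
    return (
        f"query_start_time >= to_timestamp_ltz({start_millis}, 3)\n"
        f"AND query_start_time < to_timestamp_ltz({end_millis}, 3)\n"
        f"AND {users_filter or 'TRUE'}"
    )


print(render_time_and_user_filter("", 0, 1))
# ... AND TRUE  (no user filter configured)
print(render_time_and_user_filter("user_name ILIKE 'svc%'", 0, 1))
# ... AND user_name ILIKE 'svc%'
```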
@@ -142,9 +142,9 @@ class _SnowflakeTagCache:
         )

         # self._table_tags[<database_name>][<schema_name>][<table_name>] = list of tags applied to table
-        self._table_tags: Dict[
-
-
+        self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = (
+            defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        )

         # self._column_tags[<database_name>][<schema_name>][<table_name>][<column_name>] = list of tags applied to column
         self._column_tags: Dict[

@@ -194,9 +194,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             config, self.data_dictionary, self.report
         )
         self.profiler: Optional[SnowflakeProfiler] = profiler
-        self.snowsight_url_builder: Optional[
-
-
+        self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = (
+            snowsight_url_builder
+        )

         # These are populated as side-effects of get_workunits_internal.
         self.databases: List[SnowflakeDatabase] = []

@@ -267,9 +267,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
             )
             return None
         else:
-            ischema_databases: List[
-
-
+            ischema_databases: List[SnowflakeDatabase] = (
+                self.get_databases_from_ischema(databases)
+            )

             if len(ischema_databases) == 0:
                 self.structured_reporter.failure(

@@ -38,9 +38,9 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         table_name: Optional[str],
     ) -> List[SnowflakeTag]:
         if db_name not in self.tag_cache:
-            self.tag_cache[
-                db_name
-
+            self.tag_cache[db_name] = (
+                self.data_dictionary.get_tags_for_database_without_propagation(db_name)
+            )

         if domain == SnowflakeObjectDomain.DATABASE:
             return self.tag_cache[db_name].get_database_tags(db_name)

@@ -130,10 +130,10 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         temp_column_tags: Dict[str, List[SnowflakeTag]] = {}
         if self.config.extract_tags == TagOption.without_lineage:
             if db_name not in self.tag_cache:
-                self.tag_cache[
-
-
-
+                self.tag_cache[db_name] = (
+                    self.data_dictionary.get_tags_for_database_without_propagation(
+                        db_name
+                    )
                 )
             temp_column_tags = self.tag_cache[db_name].get_column_tags_for_table(
                 table_name, schema_name, db_name

@@ -549,9 +549,9 @@ class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable):
         ):
             # NOTE: Generated emails may be incorrect, as email may be different than
             # username@email_domain
-            event_dict[
-                "
-
+            event_dict["EMAIL"] = (
+                f"{event_dict['USER_NAME']}@{self.config.email_domain}".lower()
+            )

         if not event_dict["EMAIL"]:
             self.report.rows_missing_email += 1

@@ -21,8 +21,7 @@ from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
 class SnowflakeStructuredReportMixin(abc.ABC):
     @property
     @abc.abstractmethod
-    def structured_reporter(self) -> SourceReport:
-        ...
+    def structured_reporter(self) -> SourceReport: ...


 class SnowsightUrlBuilder:

@@ -211,9 +211,9 @@ class SnowflakeV2Source(

         self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
         if self.config.include_usage_stats or self.config.include_operational_stats:
-            redundant_usage_run_skip_handler: Optional[
-
-
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,

@@ -296,7 +296,16 @@ class SnowflakeV2Source(

         _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
         privileges: List[SnowflakePrivilege] = []
-        capabilities: List[SourceCapability] = [
+        capabilities: List[SourceCapability] = [
+            c.capability
+            for c in SnowflakeV2Source.get_capabilities()  # type: ignore
+            if c.capability
+            not in (
+                SourceCapability.PLATFORM_INSTANCE,
+                SourceCapability.DOMAINS,
+                SourceCapability.DELETION_DETECTION,
+            )
+        ]

         cur = conn.query("select current_role()")
         current_role = [row["CURRENT_ROLE()"] for row in cur][0]

@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])

     @typing.no_type_check
-    def _get_column_type(
-        self, type_: Union[str, Dict[str, Any]]
-    ) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
         """Derives the data type of the Athena column.

         This method is overwritten to extend the behavior of PyAthena.