acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
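A diff like the one below can be reproduced locally with nothing but the standard library, since a wheel is a plain zip archive. A minimal sketch, assuming both wheels have already been downloaded (e.g. with pip download acryl-datahub==<version> --no-deps); the local file names are assumptions:

    import difflib
    import zipfile

    # Assumed local wheel filenames; adjust to wherever the wheels were downloaded.
    OLD = "acryl_datahub-0.15.0.1rc16-py3-none-any.whl"
    NEW = "acryl_datahub-0.15.0.2-py3-none-any.whl"
    MEMBER = "datahub/ingestion/source/snowflake/snowflake_v2.py"


    def read_member(wheel_path: str, member: str) -> list:
        # Wheels are zip archives, so members can be read without installing them.
        with zipfile.ZipFile(wheel_path) as zf:
            return zf.read(member).decode("utf-8").splitlines(keepends=True)


    diff = difflib.unified_diff(
        read_member(OLD, MEMBER),
        read_member(NEW, MEMBER),
        fromfile=f"old/{MEMBER}",
        tofile=f"new/{MEMBER}",
    )
    print("".join(diff), end="")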
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -23,7 +23,6 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
-    Source,
     SourceCapability,
     SourceReport,
     TestableSource,
@@ -212,9 +211,9 @@ class SnowflakeV2Source(
 
         self.usage_extractor: Optional[SnowflakeUsageExtractor] = None
         if self.config.include_usage_stats or self.config.include_operational_stats:
-            redundant_usage_run_skip_handler: Optional[
-                RedundantUsageRunSkipHandler
-            ] = None
+            redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = (
+                None
+            )
             if self.config.enable_stateful_usage_ingestion:
                 redundant_usage_run_skip_handler = RedundantUsageRunSkipHandler(
                     source=self,
@@ -251,11 +250,6 @@ class SnowflakeV2Source(
 
         self.add_config_to_report()
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = SnowflakeV2Config.parse_obj(config_dict)
-        return cls(ctx, config)
-
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
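The removed create() override is pure boilerplate (the same pattern is deleted from SupersetSource further down), which suggests the factory now lives on shared base-class machinery. A minimal sketch of how a base class can supply such a generic factory, assuming a get_config_class() hook; this illustrates why the per-source override becomes redundant and is not the actual datahub base-class code:

    from typing import Any, Dict, Type


    class ConfigModel:
        """Stand-in for the pydantic-style config base used by sources."""

        @classmethod
        def parse_obj(cls, obj: Dict[str, Any]) -> "ConfigModel":
            instance = cls()
            instance.__dict__.update(obj)
            return instance


    class Source:
        def __init__(self, ctx: Any, config: ConfigModel) -> None:
            self.ctx = ctx
            self.config = config

        @classmethod
        def get_config_class(cls) -> Type[ConfigModel]:
            raise NotImplementedError  # each concrete source declares its config

        @classmethod
        def create(cls, config_dict: Dict[str, Any], ctx: Any) -> "Source":
            # One generic factory replaces the identical create() methods
            # previously repeated on every source.
            config = cls.get_config_class().parse_obj(config_dict)
            return cls(ctx, config)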
@@ -302,7 +296,16 @@ class SnowflakeV2Source(
 
         _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict()
         privileges: List[SnowflakePrivilege] = []
-        capabilities: List[SourceCapability] = [
+        capabilities: List[SourceCapability] = [
+            c.capability
+            for c in SnowflakeV2Source.get_capabilities()  # type: ignore
+            if c.capability
+            not in (
+                SourceCapability.PLATFORM_INSTANCE,
+                SourceCapability.DOMAINS,
+                SourceCapability.DELETION_DETECTION,
+            )
+        ]
 
         cur = conn.query("select current_role()")
         current_role = [row["CURRENT_ROLE()"] for row in cur][0]
@@ -480,8 +483,8 @@ class SnowflakeV2Source(
             identifiers=self.identifiers,
         )
 
-        self.report.set_ingestion_stage("*", METADATA_EXTRACTION)
-        yield from schema_extractor.get_workunits_internal()
+        with self.report.new_stage(f"*: {METADATA_EXTRACTION}"):
+            yield from schema_extractor.get_workunits_internal()
 
         databases = schema_extractor.databases
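Across these hunks, paired self.report.set_ingestion_stage(...) calls are replaced by with self.report.new_stage(...) blocks. A minimal sketch of a stage reporter built around a context manager, showing why the with-form is safer (the stage always closes, even if extraction raises); the timing and logging details here are assumptions, not the actual datahub IngestionStageReport:

    import contextlib
    import logging
    import time
    from typing import Iterator

    logger = logging.getLogger(__name__)


    class StageReport:
        @contextlib.contextmanager
        def new_stage(self, stage: str) -> Iterator[None]:
            start = time.perf_counter()
            logger.info("Stage started: %s", stage)
            try:
                yield
            finally:
                # Runs on normal exit and on exceptions alike, which unpaired
                # set_ingestion_stage() calls could not guarantee.
                elapsed = time.perf_counter() - start
                logger.info("Stage finished: %s (%.2fs)", stage, elapsed)


    report = StageReport()
    with report.new_stage("*: Metadata Extraction"):
        pass  # yield work units here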
@@ -513,45 +516,46 @@ class SnowflakeV2Source(
         discovered_datasets = discovered_tables + discovered_views
 
         if self.config.use_queries_v2:
-            self.report.set_ingestion_stage("*", VIEW_PARSING)
-            yield from auto_workunit(self.aggregator.gen_metadata())
-
-            self.report.set_ingestion_stage("*", QUERIES_EXTRACTION)
-
-            schema_resolver = self.aggregator._schema_resolver
-
-            queries_extractor = SnowflakeQueriesExtractor(
-                connection=self.connection,
-                config=SnowflakeQueriesExtractorConfig(
-                    window=self.config,
-                    temporary_tables_pattern=self.config.temporary_tables_pattern,
-                    include_lineage=self.config.include_table_lineage,
-                    include_usage_statistics=self.config.include_usage_stats,
-                    include_operations=self.config.include_operational_stats,
-                    user_email_pattern=self.config.user_email_pattern,
-                ),
-                structured_report=self.report,
-                filters=self.filters,
-                identifiers=self.identifiers,
-                schema_resolver=schema_resolver,
-                discovered_tables=discovered_datasets,
-                graph=self.ctx.graph,
-            )
+            with self.report.new_stage(f"*: {VIEW_PARSING}"):
+                yield from auto_workunit(self.aggregator.gen_metadata())
 
-            # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
-            # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
-            # it should be pretty straightforward to refactor this and only initialize the aggregator once.
-            self.report.queries_extractor = queries_extractor.report
-            yield from queries_extractor.get_workunits_internal()
-            queries_extractor.close()
+            with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
+                schema_resolver = self.aggregator._schema_resolver
+
+                queries_extractor = SnowflakeQueriesExtractor(
+                    connection=self.connection,
+                    config=SnowflakeQueriesExtractorConfig(
+                        window=self.config,
+                        temporary_tables_pattern=self.config.temporary_tables_pattern,
+                        include_lineage=self.config.include_table_lineage,
+                        include_usage_statistics=self.config.include_usage_stats,
+                        include_operations=self.config.include_operational_stats,
+                        include_queries=self.config.include_queries,
+                        include_query_usage_statistics=self.config.include_query_usage_statistics,
+                        user_email_pattern=self.config.user_email_pattern,
+                    ),
+                    structured_report=self.report,
+                    filters=self.filters,
+                    identifiers=self.identifiers,
+                    schema_resolver=schema_resolver,
+                    discovered_tables=discovered_datasets,
+                    graph=self.ctx.graph,
+                )
+
+                # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs
+                # but a shared schema resolver. That's fine for now though - once we remove the old lineage/usage extractors,
+                # it should be pretty straightforward to refactor this and only initialize the aggregator once.
+                self.report.queries_extractor = queries_extractor.report
+                yield from queries_extractor.get_workunits_internal()
+                queries_extractor.close()
 
         else:
             if self.lineage_extractor:
-                self.report.set_ingestion_stage("*", LINEAGE_EXTRACTION)
-                self.lineage_extractor.add_time_based_lineage_to_aggregator(
-                    discovered_tables=discovered_tables,
-                    discovered_views=discovered_views,
-                )
+                with self.report.new_stage(f"*: {LINEAGE_EXTRACTION}"):
+                    self.lineage_extractor.add_time_based_lineage_to_aggregator(
+                        discovered_tables=discovered_tables,
+                        discovered_views=discovered_views,
+                    )
 
             # This would emit view and external table ddl lineage
             # as well as query lineage via lineage_extractor
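The TODO comment retained in this hunk notes that two SqlParsingAggregator instances end up sharing one schema resolver. A toy sketch of that arrangement (class names are illustrative, not the datahub implementation): the expensive schema lookups are cached once and reused by both aggregators.

    from typing import Dict, List


    class SchemaResolver:
        """Caches table -> column lookups so they happen once per table."""

        def __init__(self) -> None:
            self._cache: Dict[str, List[str]] = {}

        def resolve(self, table: str) -> List[str]:
            # In the real system this would query the warehouse or the graph.
            return self._cache.setdefault(table, ["id", "created_at"])


    class Aggregator:
        def __init__(self, schema_resolver: SchemaResolver, include_usage: bool) -> None:
            self._schema_resolver = schema_resolver
            self.include_usage = include_usage


    shared = SchemaResolver()
    lineage_aggregator = Aggregator(shared, include_usage=False)
    queries_aggregator = Aggregator(shared, include_usage=True)
    assert lineage_aggregator._schema_resolver is queries_aggregator._schema_resolver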
datahub/ingestion/source/sql/athena.py

@@ -104,9 +104,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
         return "\n".join([r for r in res])
 
     @typing.no_type_check
-    def _get_column_type(
-        self, type_: Union[str, Dict[str, Any]]
-    ) -> TypeEngine:  # noqa: C901
+    def _get_column_type(self, type_: Union[str, Dict[str, Any]]) -> TypeEngine:  # noqa: C901
         """Derives the data type of the Athena column.
 
         This method is overwritten to extend the behavior of PyAthena.
datahub/ingestion/source/sql/clickhouse.py

@@ -218,9 +218,7 @@ def _get_all_table_comments_and_properties(self, connection, **kw):
             , comment
             , {properties_clause} AS properties
             FROM system.tables
-            WHERE name NOT LIKE '.inner%'""".format(
-            properties_clause=properties_clause
-        )
+            WHERE name NOT LIKE '.inner%'""".format(properties_clause=properties_clause)
     )
 
     all_table_comments: Dict[Tuple[str, str], Dict[str, Any]] = {}
@@ -268,7 +266,7 @@ def _get_table_or_view_names(self, relkind, connection, schema=None, **kw):
     info_cache = kw.get("info_cache")
     all_relations = self._get_all_relation_info(connection, info_cache=info_cache)
     relation_names = []
-    for
+    for _, relation in all_relations.items():
         if relation.database == schema and relation.relkind == relkind:
             relation_names.append(relation.relname)
     return relation_names
@@ -301,9 +299,7 @@ def _get_schema_column_info(self, connection, schema=None, **kw):
             , comment
             FROM system.columns
             WHERE {schema_clause}
-            ORDER BY database, table, position""".format(
-            schema_clause=schema_clause
-        )
+            ORDER BY database, table, position""".format(schema_clause=schema_clause)
             )
         )
     )
@@ -474,7 +470,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
         for db_row in engine.execute(text(all_tables_query)):
-            all_tables_set.add(f
+            all_tables_set.add(f"{db_row['database']}.{db_row['table_name']}")
 
         return all_tables_set
 
@@ -503,7 +499,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
         try:
             for db_row in engine.execute(text(query)):
-                dataset_name = f
+                dataset_name = f"{db_row['target_schema']}.{db_row['target_table']}"
                 if not self.config.database_pattern.allowed(
                     db_row["target_schema"]
                 ) or not self.config.table_pattern.allowed(dataset_name):
@@ -512,7 +508,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
                 # Target
                 target_path = (
-                    f
+                    f"{self.config.platform_instance + '.' if self.config.platform_instance else ''}"
                     f"{dataset_name}"
                 )
                 target = LineageItem(
@@ -525,7 +521,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
 
                 # Source
                 platform = LineageDatasetPlatform.CLICKHOUSE
-                path = f
+                path = f"{db_row['source_schema']}.{db_row['source_table']}"
 
                 sources = [
                     LineageDataset(
@@ -552,9 +548,7 @@ class ClickHouseSource(TwoTierSQLAlchemySource):
                     target.dataset.path
                 ].upstreams = self._lineage_map[
                     target.dataset.path
-                ].upstreams.union(
-                    target.upstreams
-                )
+                ].upstreams.union(target.upstreams)
 
             else:
                 self._lineage_map[target.dataset.path] = target
datahub/ingestion/source/sql/oracle.py

@@ -234,9 +234,7 @@ class OracleInspectorObjectWrapper:
                 WHERE col.table_name = id.table_name
                 AND col.column_name = id.column_name
                 AND col.owner = id.owner
-            ) AS identity_options""".format(
-                dblink=dblink
-            )
+            ) AS identity_options""".format(dblink=dblink)
         else:
             identity_cols = "NULL as default_on_null, NULL as identity_options"
 
datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -278,8 +278,7 @@ class GenericProfiler:
 
         if self.config.profiling.profile_table_size_limit is not None and (
             size_in_bytes is not None
-            and size_in_bytes / (2**30)
-            > self.config.profiling.profile_table_size_limit
+            and size_in_bytes / (2**30) > self.config.profiling.profile_table_size_limit
         ):
             self.report.profiling_skipped_size_limit[schema_name] += 1
             logger.debug(
datahub/ingestion/source/sql/sql_types.py

@@ -93,7 +93,7 @@ POSTGRES_TYPES_MAP: Dict[str, Any] = {
     "regtype": None,
     "regrole": None,
     "regnamespace": None,
-    "super":
+    "super": NullType,
     "uuid": StringType,
     "pg_lsn": None,
     "tsvector": None,  # text search vector
@@ -384,7 +384,6 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
     "varchar": StringType,
     "char": StringType,
     "varbinary": BytesType,
-    "json": RecordType,
     "date": DateType,
     "time": TimeType,
     "timestamp": TimeType,
datahub/ingestion/source/sql/sql_utils.py

@@ -20,6 +20,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import DataPlatformInstanceClass
+from datahub.metadata.urns import StructuredPropertyUrn
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
@@ -75,6 +76,7 @@ def gen_schema_container(
     created: Optional[int] = None,
     last_modified: Optional[int] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
 ) -> Iterable[MetadataWorkUnit]:
     domain_urn: Optional[str] = None
     if domain_registry:
@@ -99,6 +101,7 @@ def gen_schema_container(
         owner_urn=owner_urn,
         qualified_name=qualified_name,
         extra_properties=extra_properties,
+        structured_properties=structured_properties,
     )
 
 
@@ -133,6 +136,7 @@ def gen_database_container(
     created: Optional[int] = None,
     last_modified: Optional[int] = None,
     extra_properties: Optional[Dict[str, str]] = None,
+    structured_properties: Optional[Dict[StructuredPropertyUrn, str]] = None,
 ) -> Iterable[MetadataWorkUnit]:
     domain_urn: Optional[str] = None
     if domain_registry:
@@ -154,6 +158,7 @@ def gen_database_container(
         owner_urn=owner_urn,
         qualified_name=qualified_name,
         extra_properties=extra_properties,
+        structured_properties=structured_properties,
     )
 
 
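Both container helpers now accept structured properties keyed by URN. A hedged illustration of the mapping shape a caller would pass; the property id "io.acryl.example.tier" is made up, and the single-argument StructuredPropertyUrn constructor is assumed from the import added above:

    from typing import Dict

    from datahub.metadata.urns import StructuredPropertyUrn

    # One value per structured-property URN; the id below is hypothetical.
    structured_properties: Dict[StructuredPropertyUrn, str] = {
        StructuredPropertyUrn("io.acryl.example.tier"): "gold",
    }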
datahub/ingestion/source/sql/teradata.py

@@ -599,7 +599,12 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_columns",
-            lambda self, connection, table_name, schema=None, use_qvci=self.config.use_qvci, **kw: optimized_get_columns(
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            use_qvci=self.config.use_qvci,
+            **kw: optimized_get_columns(
                 self,
                 connection,
                 table_name,
@@ -613,7 +618,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_pk_constraint",
-            lambda self, connection, table_name, schema=None, **kw: optimized_get_pk_constraint(
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_pk_constraint(
                 self, connection, table_name, schema, **kw
             ),
         )
@@ -621,7 +630,11 @@ ORDER by DataBaseName, TableName;
         setattr(  # noqa: B010
             TeradataDialect,
             "get_foreign_keys",
-            lambda self, connection, table_name, schema=None, **kw: optimized_get_foreign_keys(
+            lambda self,
+            connection,
+            table_name,
+            schema=None,
+            **kw: optimized_get_foreign_keys(
                 self, connection, table_name, schema, **kw
             ),
         )
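These hunks only reflow lambdas that monkey-patch TeradataDialect methods via setattr. A minimal sketch of the underlying pattern, with illustrative names (not the real Teradata dialect): the replacement lambda forwards to an optimized implementation and binds extra state as a keyword default, which is evaluated once at patch time.

    class Dialect:
        def get_columns(self, connection, table_name, schema=None, **kw):
            return []


    def optimized_get_columns(self, connection, table_name, schema=None,
                              use_qvci=False, **kw):
        # Faster implementation would go here; use_qvci toggles a vendor feature.
        return [{"name": "id", "use_qvci": use_qvci}]


    use_qvci_setting = True  # stands in for self.config.use_qvci in the diff
    setattr(  # noqa: B010
        Dialect,
        "get_columns",
        lambda self, connection, table_name, schema=None,
        use_qvci=use_qvci_setting, **kw: optimized_get_columns(
            self, connection, table_name, schema, use_qvci=use_qvci, **kw
        ),
    )

    # The patched method is now used by every Dialect instance.
    print(Dialect().get_columns(None, "my_table"))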
datahub/ingestion/source/sql/teradata.py

@@ -878,7 +891,7 @@ ORDER by DataBaseName, TableName;
 
         urns = self.schema_resolver.get_urns()
         if self.config.include_table_lineage or self.config.include_usage_statistics:
-            self.report.report_ingestion_stage_start("Audit log extraction")
-            yield from self.get_audit_log_mcps(urns=urns)
+            with self.report.new_stage("Audit log extraction"):
+                yield from self.get_audit_log_mcps(urns=urns)
 
         yield from self.builder.gen_workunits()
datahub/ingestion/source/state/profiling_state_handler.py

@@ -41,9 +41,9 @@ class ProfilingHandler(StatefulIngestionUsecaseHandlerBase[ProfilingCheckpointState]):
         run_id: str,
     ):
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            ProfilingStatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[ProfilingStatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self.checkpointing_enabled: bool = (
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -48,9 +48,9 @@ class RedundantRunSkipHandler(
     ):
         self.source = source
         self.state_provider = source.state_provider
-        self.stateful_ingestion_config: Optional[
-            StatefulIngestionConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulIngestionConfig] = (
+            config.stateful_ingestion
+        )
         self.pipeline_name = pipeline_name
         self.run_id = run_id
         self._job_id = self._init_job_id()
@@ -145,8 +145,7 @@ class RedundantRunSkipHandler(
         )
 
         logger.debug(
-            f"{self.job_id} : Last run start, end times:"
-            f"({last_run_time_window})"
+            f"{self.job_id} : Last run start, end times:({last_run_time_window})"
         )
 
         # If current run's time window is subset of last run's time window, then skip.
@@ -212,8 +211,7 @@ class RedundantRunSkipHandler(
         )
 
         self.log(
-            "Adjusted start, end times: "
-            f"({suggested_start_time}, {suggested_end_time})"
+            f"Adjusted start, end times: ({suggested_start_time}, {suggested_end_time})"
         )
         return (suggested_start_time, suggested_end_time)
 
datahub/ingestion/source/state/stale_entity_removal_handler.py

@@ -111,9 +111,9 @@ class StaleEntityRemovalHandler(
         self.state_type_class = state_type_class
         self.pipeline_name = pipeline_name
         self.run_id = run_id
-        self.stateful_ingestion_config: Optional[
-            StatefulStaleMetadataRemovalConfig
-        ] = config.stateful_ingestion
+        self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
+            config.stateful_ingestion
+        )
         self.checkpointing_enabled: bool = (
             True
             if (
datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py

@@ -70,20 +70,20 @@ class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             self.orchestrator_name, pipeline_name, job_name
         )
 
-        latest_checkpoint: Optional[
-            DatahubIngestionCheckpointClass
-        ] = self.graph.get_latest_timeseries_value(
-            entity_urn=data_job_urn,
-            aspect_type=DatahubIngestionCheckpointClass,
-            filter_criteria_map={
-                "pipelineName": pipeline_name,
-            },
+        latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = (
+            self.graph.get_latest_timeseries_value(
+                entity_urn=data_job_urn,
+                aspect_type=DatahubIngestionCheckpointClass,
+                filter_criteria_map={
+                    "pipelineName": pipeline_name,
+                },
+            )
         )
         if latest_checkpoint:
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else:
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py

@@ -67,7 +67,7 @@ class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase):
             logger.debug(
                 f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}',"
                 f" job_name:'{job_name}' found with start_time:"
-                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}"
+                f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis / 1000)}"
             )
             return latest_checkpoint
         else:
datahub/ingestion/source/superset.py

@@ -33,7 +33,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
@@ -265,11 +265,6 @@ class SupersetSource(StatefulIngestionSourceBase):
         # TODO(Gabe): how should we message about this error?
         return requests_session
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = SupersetConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def paginate_entity_api_results(self, entity_type, page_size=100):
         current_page = 0
         total_items = page_size