acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
|
@@ -54,6 +54,7 @@ from datahub.ingestion.source.common.data_reader import DataReader
|
|
|
54
54
|
from datahub.ingestion.source.common.subtypes import (
|
|
55
55
|
DatasetContainerSubTypes,
|
|
56
56
|
DatasetSubTypes,
|
|
57
|
+
SourceCapabilityModifier,
|
|
57
58
|
)
|
|
58
59
|
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
|
|
59
60
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
@@ -76,33 +77,36 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
|
76
77
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
77
78
|
StatefulIngestionSourceBase,
|
|
78
79
|
)
|
|
79
|
-
from datahub.metadata.
|
|
80
|
-
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
|
|
81
|
-
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
82
|
-
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
80
|
+
from datahub.metadata.schema_classes import (
|
|
83
81
|
ArrayTypeClass,
|
|
84
82
|
BooleanTypeClass,
|
|
85
83
|
BytesTypeClass,
|
|
84
|
+
DataPlatformInstanceClass,
|
|
85
|
+
DatasetLineageTypeClass,
|
|
86
|
+
DatasetPropertiesClass,
|
|
87
|
+
DatasetSnapshotClass,
|
|
86
88
|
DateTypeClass,
|
|
87
89
|
EnumTypeClass,
|
|
88
|
-
|
|
89
|
-
|
|
90
|
+
FineGrainedLineageClass,
|
|
91
|
+
FineGrainedLineageDownstreamTypeClass,
|
|
92
|
+
FineGrainedLineageUpstreamTypeClass,
|
|
93
|
+
ForeignKeyConstraintClass,
|
|
94
|
+
GlobalTagsClass,
|
|
95
|
+
MetadataChangeEventClass,
|
|
96
|
+
MySqlDDLClass,
|
|
90
97
|
NullTypeClass,
|
|
91
98
|
NumberTypeClass,
|
|
92
99
|
RecordTypeClass,
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
100
|
+
SchemaFieldClass,
|
|
101
|
+
SchemaFieldDataTypeClass,
|
|
102
|
+
SchemaMetadataClass,
|
|
103
|
+
StatusClass,
|
|
96
104
|
StringTypeClass,
|
|
97
|
-
TimeTypeClass,
|
|
98
|
-
)
|
|
99
|
-
from datahub.metadata.schema_classes import (
|
|
100
|
-
DataPlatformInstanceClass,
|
|
101
|
-
DatasetLineageTypeClass,
|
|
102
|
-
DatasetPropertiesClass,
|
|
103
|
-
GlobalTagsClass,
|
|
104
105
|
SubTypesClass,
|
|
105
106
|
TagAssociationClass,
|
|
107
|
+
TimeTypeClass,
|
|
108
|
+
UpstreamClass,
|
|
109
|
+
UpstreamLineageClass,
|
|
106
110
|
ViewPropertiesClass,
|
|
107
111
|
)
|
|
108
112
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -112,6 +116,7 @@ from datahub.utilities.registries.domain_registry import DomainRegistry
|
|
|
112
116
|
from datahub.utilities.sqlalchemy_type_converter import (
|
|
113
117
|
get_native_data_type_for_sqlalchemy_type,
|
|
114
118
|
)
|
|
119
|
+
from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
|
|
115
120
|
|
|
116
121
|
if TYPE_CHECKING:
|
|
117
122
|
from datahub.ingestion.source.ge_data_profiler import (
|
|
@@ -198,7 +203,7 @@ def make_sqlalchemy_type(name: str) -> Type[TypeEngine]:
|
|
|
198
203
|
|
|
199
204
|
def get_column_type(
|
|
200
205
|
sql_report: SQLSourceReport, dataset_name: str, column_type: Any
|
|
201
|
-
) ->
|
|
206
|
+
) -> SchemaFieldDataTypeClass:
|
|
202
207
|
"""
|
|
203
208
|
Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
|
|
204
209
|
"""
|
|
@@ -223,7 +228,7 @@ def get_column_type(
|
|
|
223
228
|
)
|
|
224
229
|
TypeClass = NullTypeClass
|
|
225
230
|
|
|
226
|
-
return
|
|
231
|
+
return SchemaFieldDataTypeClass(type=TypeClass())
|
|
227
232
|
|
|
228
233
|
|
|
229
234
|
def get_schema_metadata(
|
|
@@ -232,10 +237,10 @@ def get_schema_metadata(
|
|
|
232
237
|
platform: str,
|
|
233
238
|
columns: List[dict],
|
|
234
239
|
pk_constraints: Optional[dict] = None,
|
|
235
|
-
foreign_keys: Optional[List[
|
|
236
|
-
canonical_schema: Optional[List[
|
|
240
|
+
foreign_keys: Optional[List[ForeignKeyConstraintClass]] = None,
|
|
241
|
+
canonical_schema: Optional[List[SchemaFieldClass]] = None,
|
|
237
242
|
simplify_nested_field_paths: bool = False,
|
|
238
|
-
) ->
|
|
243
|
+
) -> SchemaMetadataClass:
|
|
239
244
|
if (
|
|
240
245
|
simplify_nested_field_paths
|
|
241
246
|
and canonical_schema is not None
|
|
@@ -243,12 +248,12 @@ def get_schema_metadata(
|
|
|
243
248
|
):
|
|
244
249
|
canonical_schema = downgrade_schema_from_v2(canonical_schema)
|
|
245
250
|
|
|
246
|
-
schema_metadata =
|
|
251
|
+
schema_metadata = SchemaMetadataClass(
|
|
247
252
|
schemaName=dataset_name,
|
|
248
253
|
platform=make_data_platform_urn(platform),
|
|
249
254
|
version=0,
|
|
250
255
|
hash="",
|
|
251
|
-
platformSchema=
|
|
256
|
+
platformSchema=MySqlDDLClass(tableSchema=""),
|
|
252
257
|
fields=canonical_schema or [],
|
|
253
258
|
)
|
|
254
259
|
if foreign_keys is not None and foreign_keys != []:
|
|
@@ -287,6 +292,10 @@ class ProfileMetadata:
|
|
|
287
292
|
SourceCapability.CONTAINERS,
|
|
288
293
|
"Enabled by default",
|
|
289
294
|
supported=True,
|
|
295
|
+
subtype_modifier=[
|
|
296
|
+
SourceCapabilityModifier.DATABASE,
|
|
297
|
+
SourceCapabilityModifier.SCHEMA,
|
|
298
|
+
],
|
|
290
299
|
)
|
|
291
300
|
@capability(
|
|
292
301
|
SourceCapability.DESCRIPTIONS,
|
|
@@ -298,6 +307,20 @@ class ProfileMetadata:
|
|
|
298
307
|
"Enabled by default",
|
|
299
308
|
supported=True,
|
|
300
309
|
)
|
|
310
|
+
@capability(
|
|
311
|
+
SourceCapability.LINEAGE_COARSE,
|
|
312
|
+
"Enabled by default to get lineage for views via `include_view_lineage`",
|
|
313
|
+
subtype_modifier=[SourceCapabilityModifier.VIEW],
|
|
314
|
+
)
|
|
315
|
+
@capability(
|
|
316
|
+
SourceCapability.LINEAGE_FINE,
|
|
317
|
+
"Enabled by default to get lineage for views via `include_view_column_lineage`",
|
|
318
|
+
subtype_modifier=[SourceCapabilityModifier.VIEW],
|
|
319
|
+
)
|
|
320
|
+
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
321
|
+
@capability(
|
|
322
|
+
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
|
323
|
+
)
|
|
301
324
|
class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
302
325
|
"""A Base class for all SQL Sources that use SQLAlchemy to extend"""
|
|
303
326
|
|
|
@@ -570,6 +593,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
570
593
|
)
|
|
571
594
|
|
|
572
595
|
# Generate workunit for aggregated SQL parsing results
|
|
596
|
+
yield from self._generate_aggregator_workunits()
|
|
597
|
+
|
|
598
|
+
def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
|
|
599
|
+
"""Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
|
|
573
600
|
for mcp in self.aggregator.gen_metadata():
|
|
574
601
|
yield mcp.as_workunit()
|
|
575
602
|
|
|
@@ -590,7 +617,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
590
617
|
schema: str,
|
|
591
618
|
fk_dict: Dict[str, str],
|
|
592
619
|
inspector: Inspector,
|
|
593
|
-
) ->
|
|
620
|
+
) -> ForeignKeyConstraintClass:
|
|
594
621
|
referred_schema: Optional[str] = fk_dict.get("referred_schema")
|
|
595
622
|
|
|
596
623
|
if not referred_schema:
|
|
@@ -617,7 +644,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
617
644
|
for f in fk_dict["referred_columns"]
|
|
618
645
|
]
|
|
619
646
|
|
|
620
|
-
return
|
|
647
|
+
return ForeignKeyConstraintClass(
|
|
621
648
|
fk_dict["name"], foreign_fields, source_fields, foreign_dataset
|
|
622
649
|
)
|
|
623
650
|
|
|
@@ -714,7 +741,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
714
741
|
self.config.platform_instance,
|
|
715
742
|
self.config.env,
|
|
716
743
|
)
|
|
717
|
-
dataset_snapshot =
|
|
744
|
+
dataset_snapshot = DatasetSnapshotClass(
|
|
718
745
|
urn=dataset_urn,
|
|
719
746
|
aspects=[StatusClass(removed=False)],
|
|
720
747
|
)
|
|
@@ -742,6 +769,30 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
742
769
|
tags=extra_tags,
|
|
743
770
|
partition_keys=partitions,
|
|
744
771
|
)
|
|
772
|
+
|
|
773
|
+
if self.config.include_table_location_lineage and location_urn:
|
|
774
|
+
self.aggregator.add_known_lineage_mapping(
|
|
775
|
+
upstream_urn=location_urn,
|
|
776
|
+
downstream_urn=dataset_snapshot.urn,
|
|
777
|
+
lineage_type=DatasetLineageTypeClass.COPY,
|
|
778
|
+
)
|
|
779
|
+
external_upstream_table = UpstreamClass(
|
|
780
|
+
dataset=location_urn,
|
|
781
|
+
type=DatasetLineageTypeClass.COPY,
|
|
782
|
+
)
|
|
783
|
+
|
|
784
|
+
yield MetadataChangeProposalWrapper(
|
|
785
|
+
entityUrn=dataset_snapshot.urn,
|
|
786
|
+
aspect=UpstreamLineageClass(
|
|
787
|
+
upstreams=[external_upstream_table],
|
|
788
|
+
fineGrainedLineages=self.get_fine_grained_lineages(
|
|
789
|
+
dataset_urn=dataset_snapshot.urn,
|
|
790
|
+
upstream_dataset_urn=location_urn,
|
|
791
|
+
schema_fields=schema_fields,
|
|
792
|
+
),
|
|
793
|
+
),
|
|
794
|
+
).as_workunit()
|
|
795
|
+
|
|
745
796
|
schema_metadata = get_schema_metadata(
|
|
746
797
|
self.report,
|
|
747
798
|
dataset_name,
|
|
@@ -762,7 +813,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
762
813
|
yield from self.add_table_to_schema_container(
|
|
763
814
|
dataset_urn=dataset_urn, db_name=db_name, schema=schema
|
|
764
815
|
)
|
|
765
|
-
mce =
|
|
816
|
+
mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
|
|
766
817
|
yield SqlWorkUnit(id=dataset_name, mce=mce)
|
|
767
818
|
dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
768
819
|
if dpi_aspect:
|
|
@@ -797,7 +848,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
797
848
|
schema: str,
|
|
798
849
|
table: str,
|
|
799
850
|
data_reader: Optional[DataReader],
|
|
800
|
-
schema_metadata:
|
|
851
|
+
schema_metadata: SchemaMetadataClass,
|
|
801
852
|
) -> None:
|
|
802
853
|
try:
|
|
803
854
|
if (
|
|
@@ -908,7 +959,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
908
959
|
|
|
909
960
|
def _get_foreign_keys(
|
|
910
961
|
self, dataset_urn: str, inspector: Inspector, schema: str, table: str
|
|
911
|
-
) -> List[
|
|
962
|
+
) -> List[ForeignKeyConstraintClass]:
|
|
912
963
|
try:
|
|
913
964
|
foreign_keys = [
|
|
914
965
|
self.get_foreign_key_metadata(dataset_urn, schema, fk_rec, inspector)
|
|
@@ -922,6 +973,42 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
922
973
|
foreign_keys = []
|
|
923
974
|
return foreign_keys
|
|
924
975
|
|
|
976
|
+
def get_fine_grained_lineages(
|
|
977
|
+
self,
|
|
978
|
+
dataset_urn: str,
|
|
979
|
+
upstream_dataset_urn: str,
|
|
980
|
+
schema_fields: List[SchemaFieldClass],
|
|
981
|
+
) -> Optional[List[FineGrainedLineageClass]]:
|
|
982
|
+
fine_grained_lineages: List[FineGrainedLineageClass] = []
|
|
983
|
+
|
|
984
|
+
for schema_field in schema_fields:
|
|
985
|
+
try:
|
|
986
|
+
field_path_v1 = get_simple_field_path_from_v2_field_path(
|
|
987
|
+
schema_field.fieldPath
|
|
988
|
+
)
|
|
989
|
+
fine_grained_lineages.append(
|
|
990
|
+
FineGrainedLineageClass(
|
|
991
|
+
downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
|
|
992
|
+
downstreams=[make_schema_field_urn(dataset_urn, field_path_v1)],
|
|
993
|
+
upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
|
|
994
|
+
upstreams=[
|
|
995
|
+
make_schema_field_urn(
|
|
996
|
+
upstream_dataset_urn,
|
|
997
|
+
get_simple_field_path_from_v2_field_path(
|
|
998
|
+
schema_field.fieldPath
|
|
999
|
+
),
|
|
1000
|
+
)
|
|
1001
|
+
],
|
|
1002
|
+
)
|
|
1003
|
+
)
|
|
1004
|
+
except Exception as e:
|
|
1005
|
+
logger.warning(
|
|
1006
|
+
f"Error processing field path for {dataset_urn}: {str(e)}"
|
|
1007
|
+
)
|
|
1008
|
+
continue
|
|
1009
|
+
|
|
1010
|
+
return fine_grained_lineages if fine_grained_lineages else None
|
|
1011
|
+
|
|
925
1012
|
def get_schema_fields(
|
|
926
1013
|
self,
|
|
927
1014
|
dataset_name: str,
|
|
@@ -930,7 +1017,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
930
1017
|
pk_constraints: Optional[dict] = None,
|
|
931
1018
|
partition_keys: Optional[List[str]] = None,
|
|
932
1019
|
tags: Optional[Dict[str, List[str]]] = None,
|
|
933
|
-
) -> List[
|
|
1020
|
+
) -> List[SchemaFieldClass]:
|
|
934
1021
|
canonical_schema = []
|
|
935
1022
|
for column in columns:
|
|
936
1023
|
column_tags: Optional[List[str]] = None
|
|
@@ -955,14 +1042,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
955
1042
|
pk_constraints: Optional[dict] = None,
|
|
956
1043
|
partition_keys: Optional[List[str]] = None,
|
|
957
1044
|
tags: Optional[List[str]] = None,
|
|
958
|
-
) -> List[
|
|
1045
|
+
) -> List[SchemaFieldClass]:
|
|
959
1046
|
gtc: Optional[GlobalTagsClass] = None
|
|
960
1047
|
if tags:
|
|
961
1048
|
tags_str = [make_tag_urn(t) for t in tags]
|
|
962
1049
|
tags_tac = [TagAssociationClass(t) for t in tags_str]
|
|
963
1050
|
gtc = GlobalTagsClass(tags_tac)
|
|
964
1051
|
full_type = column.get("full_type")
|
|
965
|
-
field =
|
|
1052
|
+
field = SchemaFieldClass(
|
|
966
1053
|
fieldPath=column["name"],
|
|
967
1054
|
type=get_column_type(self.report, dataset_name, column["type"]),
|
|
968
1055
|
nativeDataType=(
|
|
@@ -1092,7 +1179,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1092
1179
|
default_schema=default_schema,
|
|
1093
1180
|
)
|
|
1094
1181
|
|
|
1095
|
-
dataset_snapshot =
|
|
1182
|
+
dataset_snapshot = DatasetSnapshotClass(
|
|
1096
1183
|
urn=dataset_urn,
|
|
1097
1184
|
aspects=[StatusClass(removed=False)],
|
|
1098
1185
|
)
|
|
@@ -1111,7 +1198,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
|
|
|
1111
1198
|
dataset_snapshot.aspects.append(dataset_properties)
|
|
1112
1199
|
if schema_metadata:
|
|
1113
1200
|
dataset_snapshot.aspects.append(schema_metadata)
|
|
1114
|
-
mce =
|
|
1201
|
+
mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)
|
|
1115
1202
|
yield SqlWorkUnit(id=dataset_name, mce=mce)
|
|
1116
1203
|
dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
|
|
1117
1204
|
if dpi_aspect:
|
|
@@ -57,10 +57,11 @@ class GenericProfiler:
|
|
|
57
57
|
platform: Optional[str] = None,
|
|
58
58
|
profiler_args: Optional[Dict] = None,
|
|
59
59
|
) -> Iterable[MetadataWorkUnit]:
|
|
60
|
+
# We don't run ge profiling queries if table profiling is enabled or if the row count is 0.
|
|
60
61
|
ge_profile_requests: List[GEProfilerRequest] = [
|
|
61
62
|
cast(GEProfilerRequest, request)
|
|
62
63
|
for request in requests
|
|
63
|
-
if not request.profile_table_level_only
|
|
64
|
+
if not request.profile_table_level_only or request.table.rows_count == 0
|
|
64
65
|
]
|
|
65
66
|
table_level_profile_requests: List[TableProfilerRequest] = [
|
|
66
67
|
request for request in requests if request.profile_table_level_only
|