acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/clickhouse.py
@@ -379,7 +379,9 @@ clickhouse_datetime_format = "%Y-%m-%d %H:%M:%S"
 @platform_name("ClickHouse")
 @config_class(ClickHouseConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 class ClickHouseSource(TwoTierSQLAlchemySource):
     """
datahub/ingestion/source/sql/cockroachdb.py
@@ -26,7 +26,6 @@ class CockroachDBConfig(PostgresConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class CockroachDBSource(PostgresSource):
     config: CockroachDBConfig
 
datahub/ingestion/source/sql/hana.py
@@ -27,7 +27,9 @@ class HanaConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class HanaSource(SQLAlchemySource):
     def __init__(self, config: HanaConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "hana")
datahub/ingestion/source/sql/hive_metastore.py
@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,
@@ -161,7 +160,9 @@ class HiveMetastore(BasicSQLAlchemyConfig):
 @platform_name("Hive Metastore")
 @config_class(HiveMetastore)
 @support_status(SupportStatus.CERTIFIED)
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Not Supported", False)
 @capability(SourceCapability.CLASSIFICATION, "Not Supported", False)
 @capability(
@@ -599,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             yield dpi_aspect
 
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.table_subtype]),
         ).as_workunit()
 
@@ -808,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
 
         # Add views subtype
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.view_subtype]),
         ).as_workunit()
 
@@ -822,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             viewLogic=dataset.view_definition if dataset.view_definition else "",
         )
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="viewProperties",
             aspect=view_properties_aspect,
         ).as_workunit()
 
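The three hive_metastore.py hunks above drop the explicit `entityType`, `changeType`, and `aspectName` arguments: `MetadataChangeProposalWrapper` can infer the entity type from the URN and the aspect name from the aspect object, and the change type defaults to UPSERT. A minimal sketch of the shorter construction (the URN below is illustrative and assumes the acryl-datahub package is installed):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType ("dataset"), aspectName ("subTypes"), and changeType (UPSERT)
# are all inferred by the wrapper from the URN and the aspect instance.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)",
    aspect=SubTypesClass(typeNames=["Table"]),
)
# Inside a source this is then emitted via .as_workunit(), as in the hunks above.
```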
datahub/ingestion/source/sql/mariadb.py
@@ -15,7 +15,6 @@ from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MariaDBSource(MySQLSource):
     def get_platform(self):
         return "mariadb"
datahub/ingestion/source/sql/mssql/source.py
@@ -27,6 +27,7 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
     MSSQLDataFlow,
@@ -174,7 +175,22 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
-@capability(
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Enabled by default to get lineage for stored procedures via `include_lineage` and for views via `include_view_column_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.STORED_PROCEDURE,
+        SourceCapabilityModifier.VIEW,
+    ],
+)
 class SQLServerSource(SQLAlchemySource):
     """
     This plugin extracts the following:
@@ -323,9 +339,11 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_jobs(inspector, self.config)
         except Exception as e:
-            self.report.
-                "jobs",
-
+            self.report.failure(
+                message="Failed to list jobs",
+                title="SQL Server Jobs Extraction",
+                context="Error occurred during database-level job extraction",
+                exc=e,
             )
 
     def get_schema_level_workunits(
@@ -343,12 +361,158 @@ class SQLServerSource(SQLAlchemySource):
         try:
             yield from self.loop_stored_procedures(inspector, schema, self.config)
         except Exception as e:
-            self.report.
-                "
-
+            self.report.failure(
+                message="Failed to list stored procedures",
+                title="SQL Server Stored Procedures Extraction",
+                context="Error occurred during schema-level stored procedure extraction",
+                exc=e,
             )
 
+    def _detect_rds_environment(self, conn: Connection) -> bool:
+        """
+        Detect if we're running in an RDS/managed environment vs on-premises.
+        Returns True if RDS/managed, False if on-premises.
+        """
+        try:
+            # Try to access system tables directly - this typically fails in RDS
+            conn.execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
+            logger.debug(
+                "Direct table access successful - likely on-premises environment"
+            )
+            return False
+        except Exception:
+            logger.debug("Direct table access failed - likely RDS/managed environment")
+            return True
+
     def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]:
+        """
+        Get job information with environment detection to choose optimal method first.
+        """
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # Detect environment to choose optimal method first
+        is_rds = self._detect_rds_environment(conn)
+
+        if is_rds:
+            # Managed environment - try stored procedures first
+            try:
+                jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using stored procedures (managed environment)"
+                )
+                return jobs
+            except Exception as sp_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via stored procedures in managed environment: {sp_error}"
+                )
+                # Try direct query as fallback (might work in some managed environments)
+                try:
+                    jobs = self._get_jobs_via_direct_query(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using direct query fallback in managed environment"
+                    )
+                    return jobs
+                except Exception as direct_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in managed environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both stored procedures and direct query methods failed",
+                        exc=direct_error,
+                    )
+        else:
+            # On-premises environment - try direct query first (usually faster)
+            try:
+                jobs = self._get_jobs_via_direct_query(conn, db_name)
+                logger.info(
+                    "Successfully retrieved jobs using direct query (on-premises environment)"
+                )
+                return jobs
+            except Exception as direct_error:
+                logger.warning(
+                    f"Failed to retrieve jobs via direct query in on-premises environment: {direct_error}"
+                )
+                # Try stored procedures as fallback
+                try:
+                    jobs = self._get_jobs_via_stored_procedures(conn, db_name)
+                    logger.info(
+                        "Successfully retrieved jobs using stored procedures fallback in on-premises environment"
+                    )
+                    return jobs
+                except Exception as sp_error:
+                    self.report.failure(
+                        message="Failed to retrieve jobs in on-premises environment",
+                        title="SQL Server Jobs Extraction",
+                        context="Both direct query and stored procedures methods failed",
+                        exc=sp_error,
+                    )
+
+        return jobs
+
+    def _get_jobs_via_stored_procedures(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        jobs: Dict[str, Dict[str, Any]] = {}
+
+        # First, get all jobs
+        jobs_result = conn.execute("EXEC msdb.dbo.sp_help_job")
+        jobs_data = {}
+
+        for row in jobs_result:
+            job_id = str(row["job_id"])
+            jobs_data[job_id] = {
+                "job_id": job_id,
+                "name": row["name"],
+                "description": row.get("description", ""),
+                "date_created": row.get("date_created"),
+                "date_modified": row.get("date_modified"),
+                "enabled": row.get("enabled", 1),
+            }
+
+        # Now get job steps for each job, filtering by database
+        for job_id, job_info in jobs_data.items():
+            try:
+                # Get steps for this specific job
+                steps_result = conn.execute(
+                    f"EXEC msdb.dbo.sp_help_jobstep @job_id = '{job_id}'"
+                )
+
+                job_steps = {}
+                for step_row in steps_result:
+                    # Only include steps that run against our target database
+                    step_database = step_row.get("database_name", "")
+                    if step_database.lower() == db_name.lower() or not step_database:
+                        step_data = {
+                            "job_id": job_id,
+                            "job_name": job_info["name"],
+                            "description": job_info["description"],
+                            "date_created": job_info["date_created"],
+                            "date_modified": job_info["date_modified"],
+                            "step_id": step_row["step_id"],
+                            "step_name": step_row["step_name"],
+                            "subsystem": step_row.get("subsystem", ""),
+                            "command": step_row.get("command", ""),
+                            "database_name": step_database,
+                        }
+                        job_steps[step_row["step_id"]] = step_data
+
+                # Only add job if it has relevant steps
+                if job_steps:
+                    jobs[job_info["name"]] = job_steps
+
+            except Exception as step_error:
+                logger.warning(
+                    f"Failed to get steps for job {job_info['name']}: {step_error}"
+                )
+                continue
+
+        return jobs
+
+    def _get_jobs_via_direct_query(
+        self, conn: Connection, db_name: str
+    ) -> Dict[str, Dict[str, Any]]:
+        """
+        Original method using direct table access for on-premises SQL Server.
+        """
         jobs_data = conn.execute(
             f"""
             SELECT
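The `_get_jobs` rewrite above follows a detect-then-fallback pattern: probe whether direct access to the `msdb` system tables works, prefer the faster path when it does, and fall back to the other retrieval method otherwise. A standalone sketch of that pattern; `fetch_via_system_tables` and `fetch_via_stored_procedures` are hypothetical stand-ins for the real query helpers:

```python
from typing import Any, Callable, Dict


def probe_direct_access(execute: Callable[[str], Any]) -> bool:
    """Return True when direct msdb table access works (on-premises style)."""
    try:
        execute("SELECT TOP 1 * FROM msdb.dbo.sysjobs")
        return True
    except Exception:
        return False


def fetch_jobs(
    execute: Callable[[str], Any],
    fetch_via_system_tables: Callable[[], Dict[str, Any]],
    fetch_via_stored_procedures: Callable[[], Dict[str, Any]],
) -> Dict[str, Any]:
    """Try the method suited to the detected environment, fall back to the other."""
    if probe_direct_access(execute):
        primary, fallback = fetch_via_system_tables, fetch_via_stored_procedures
    else:
        primary, fallback = fetch_via_stored_procedures, fetch_via_system_tables
    try:
        return primary()
    except Exception:
        return fallback()
```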
@@ -371,6 +535,7 @@ class SQLServerSource(SQLAlchemySource):
             where database_name = '{db_name}'
             """
         )
+
         jobs: Dict[str, Dict[str, Any]] = {}
         for row in jobs_data:
             step_data = dict(
@@ -383,11 +548,13 @@ class SQLServerSource(SQLAlchemySource):
                 step_name=row["step_name"],
                 subsystem=row["subsystem"],
                 command=row["command"],
+                database_name=row["database_name"],
             )
             if row["name"] in jobs:
                 jobs[row["name"]][row["step_id"]] = step_data
             else:
                 jobs[row["name"]] = {row["step_id"]: step_data}
+
         return jobs
 
     def loop_jobs(
@@ -397,21 +564,59 @@ class SQLServerSource(SQLAlchemySource):
     ) -> Iterable[MetadataWorkUnit]:
         """
         Loop MS SQL jobs as dataFlow-s.
-
+        Now supports both managed and on-premises SQL Server.
         """
         db_name = self.get_db_name(inspector)
-
-
-
-
-
-
-
-
+
+        try:
+            with inspector.engine.connect() as conn:
+                jobs = self._get_jobs(conn, db_name)
+
+                if not jobs:
+                    logger.info(f"No jobs found for database: {db_name}")
+                    return
+
+                logger.info(f"Found {len(jobs)} jobs for database: {db_name}")
+
+                for job_name, job_steps in jobs.items():
+                    try:
+                        job = MSSQLJob(
+                            name=job_name,
+                            env=sql_config.env,
+                            db=db_name,
+                            platform_instance=sql_config.platform_instance,
+                        )
+                        data_flow = MSSQLDataFlow(entity=job)
+                        yield from self.construct_flow_workunits(data_flow=data_flow)
+                        yield from self.loop_job_steps(job, job_steps)
+
+                    except Exception as job_error:
+                        logger.warning(f"Failed to process job {job_name}: {job_error}")
+                        self.report.warning(
+                            message=f"Failed to process job {job_name}",
+                            title="SQL Server Jobs Extraction",
+                            context="Error occurred while processing individual job",
+                            exc=job_error,
+                        )
+                        continue
+
+        except Exception as e:
+            error_message = f"Failed to retrieve jobs for database {db_name}: {e}"
+            logger.error(error_message)
+
+            # Provide specific guidance for permission issues
+            if "permission" in str(e).lower() or "denied" in str(e).lower():
+                permission_guidance = (
+                    "For managed SQL Server services, ensure the following permissions are granted:\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_job TO datahub_read;\n"
+                    "GRANT EXECUTE ON msdb.dbo.sp_help_jobstep TO datahub_read;\n"
+                    "For on-premises SQL Server, you may also need:\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobs TO datahub_read;\n"
+                    "GRANT SELECT ON msdb.dbo.sysjobsteps TO datahub_read;"
                 )
-
-
-
+                logger.info(permission_guidance)
+
+            raise e
 
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
@@ -740,25 +945,25 @@ class SQLServerSource(SQLAlchemySource):
         url = self.config.get_sql_alchemy_url()
         logger.debug(f"sql_alchemy_url={url}")
         engine = create_engine(url, **self.config.options)
-
-
-
-
-
+
+        if self.config.database and self.config.database != "":
+            inspector = inspect(engine)
+            yield inspector
+        else:
+            with engine.begin() as conn:
                 databases = conn.execute(
                     "SELECT name FROM master.sys.databases WHERE name NOT IN \
                     ('master', 'model', 'msdb', 'tempdb', 'Resource', \
                     'distribution' , 'reportserver', 'reportservertempdb'); "
-                )
-
-
-
-
-
-
-
-
-                yield inspector
+                ).fetchall()
+
+                for db in databases:
+                    if self.config.database_pattern.allowed(db["name"]):
+                        url = self.config.get_sql_alchemy_url(current_db=db["name"])
+                        engine = create_engine(url, **self.config.options)
+                        inspector = inspect(engine)
+                        self.current_database = db["name"]
+                        yield inspector
 
     def get_identifier(
         self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
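The last mssql hunk above replaces the single-inspector flow: when no single `database` is configured, the source now enumerates user databases from `master.sys.databases` and yields one engine/inspector per database allowed by `database_pattern`. A rough standalone sketch of that loop using plain SQLAlchemy; `make_url_for_db` and `is_allowed` are hypothetical placeholders for the config-driven pieces:

```python
from typing import Callable, Iterable

from sqlalchemy import create_engine, inspect, text
from sqlalchemy.engine.reflection import Inspector


def inspectors_per_database(
    base_url: str,
    make_url_for_db: Callable[[str], str],  # hypothetical: builds a URL scoped to one database
    is_allowed: Callable[[str], bool],  # hypothetical: stands in for database_pattern.allowed
) -> Iterable[Inspector]:
    """Yield one SQLAlchemy inspector per user database, mirroring the loop above."""
    engine = create_engine(base_url)
    with engine.begin() as conn:
        rows = conn.execute(
            text(
                "SELECT name FROM master.sys.databases "
                "WHERE name NOT IN ('master', 'model', 'msdb', 'tempdb')"
            )
        ).fetchall()
    for (db_name,) in rows:
        if is_allowed(db_name):
            yield inspect(create_engine(make_url_for_db(db_name)))
```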
datahub/ingestion/source/sql/mysql.py
@@ -65,7 +65,6 @@ class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
 class MySQLSource(TwoTierSQLAlchemySource):
     """
     This plugin extracts the following:
datahub/ingestion/source/sql/oracle.py
@@ -441,7 +441,7 @@ class OracleInspectorObjectWrapper:
             "\nac.constraint_name,"
             "\nac.constraint_type,"
             "\nacc.column_name AS local_column,"
-            "\nac.
+            "\nac.table_name AS remote_table,"
             "\nrcc.column_name AS remote_column,"
             "\nac.r_owner AS remote_owner,"
             "\nacc.position AS loc_pos,"
datahub/ingestion/source/sql/postgres.py
@@ -131,7 +131,6 @@ class PostgresConfig(BasePostgresConfig):
 @capability(SourceCapability.DOMAINS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
-@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration")
 class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following: