acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
|
@@ -164,6 +164,23 @@ class SnowflakeQuery:
|
|
|
164
164
|
and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
|
|
165
165
|
order by table_schema, table_name"""
|
|
166
166
|
|
|
167
|
+
@staticmethod
|
|
168
|
+
def procedures_for_database(db_name: Optional[str]) -> str:
|
|
169
|
+
db_clause = f'"{db_name}".' if db_name is not None else ""
|
|
170
|
+
return f"""
|
|
171
|
+
SELECT procedure_catalog AS "PROCEDURE_CATALOG",
|
|
172
|
+
procedure_schema AS "PROCEDURE_SCHEMA",
|
|
173
|
+
procedure_name AS "PROCEDURE_NAME",
|
|
174
|
+
procedure_language AS "PROCEDURE_LANGUAGE",
|
|
175
|
+
argument_signature AS "ARGUMENT_SIGNATURE",
|
|
176
|
+
data_type AS "PROCEDURE_RETURN_TYPE",
|
|
177
|
+
procedure_definition AS "PROCEDURE_DEFINITION",
|
|
178
|
+
created AS "CREATED",
|
|
179
|
+
last_altered AS "LAST_ALTERED",
|
|
180
|
+
comment AS "COMMENT"
|
|
181
|
+
FROM {db_clause}information_schema.procedures
|
|
182
|
+
order by procedure_schema, procedure_name"""
|
|
183
|
+
|
|
167
184
|
@staticmethod
|
|
168
185
|
def get_all_tags():
|
|
169
186
|
return """
|
|
@@ -105,6 +105,7 @@ class SnowflakeV2Report(
|
|
|
105
105
|
databases_scanned: int = 0
|
|
106
106
|
tags_scanned: int = 0
|
|
107
107
|
streams_scanned: int = 0
|
|
108
|
+
procedures_scanned: int = 0
|
|
108
109
|
|
|
109
110
|
include_usage_stats: bool = False
|
|
110
111
|
include_operational_stats: bool = False
|
|
@@ -163,6 +164,8 @@ class SnowflakeV2Report(
|
|
|
163
164
|
self.tags_scanned += 1
|
|
164
165
|
elif ent_type == "stream":
|
|
165
166
|
self.streams_scanned += 1
|
|
167
|
+
elif ent_type == "procedure":
|
|
168
|
+
self.procedures_scanned += 1
|
|
166
169
|
else:
|
|
167
170
|
raise KeyError(f"Unknown entity {ent_type}.")
|
|
168
171
|
|
|
@@ -14,6 +14,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import (
|
|
|
14
14
|
SnowflakeQuery,
|
|
15
15
|
)
|
|
16
16
|
from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView
|
|
17
|
+
from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
|
|
17
18
|
from datahub.utilities.file_backed_collections import FileBackedDict
|
|
18
19
|
from datahub.utilities.prefix_batch_builder import PrefixGroup, build_prefix_batches
|
|
19
20
|
from datahub.utilities.serialized_lru_cache import serialized_lru_cache
|
|
@@ -714,3 +715,31 @@ class SnowflakeDataDictionary(SupportsAsObj):
|
|
|
714
715
|
stream_pagination_marker = stream_name
|
|
715
716
|
|
|
716
717
|
return streams
|
|
718
|
+
|
|
719
|
+
@serialized_lru_cache(maxsize=1)
|
|
720
|
+
def get_procedures_for_database(
|
|
721
|
+
self, db_name: str
|
|
722
|
+
) -> Dict[str, List[BaseProcedure]]:
|
|
723
|
+
procedures: Dict[str, List[BaseProcedure]] = {}
|
|
724
|
+
cur = self.connection.query(
|
|
725
|
+
SnowflakeQuery.procedures_for_database(db_name),
|
|
726
|
+
)
|
|
727
|
+
|
|
728
|
+
for procedure in cur:
|
|
729
|
+
if procedure["PROCEDURE_SCHEMA"] not in procedures:
|
|
730
|
+
procedures[procedure["PROCEDURE_SCHEMA"]] = []
|
|
731
|
+
|
|
732
|
+
procedures[procedure["PROCEDURE_SCHEMA"]].append(
|
|
733
|
+
BaseProcedure(
|
|
734
|
+
name=procedure["PROCEDURE_NAME"],
|
|
735
|
+
language=procedure["PROCEDURE_LANGUAGE"],
|
|
736
|
+
argument_signature=procedure["ARGUMENT_SIGNATURE"],
|
|
737
|
+
return_type=procedure["PROCEDURE_RETURN_TYPE"],
|
|
738
|
+
procedure_definition=procedure["PROCEDURE_DEFINITION"],
|
|
739
|
+
created=procedure["CREATED"],
|
|
740
|
+
last_altered=procedure["LAST_ALTERED"],
|
|
741
|
+
comment=procedure["COMMENT"],
|
|
742
|
+
extra_properties=None,
|
|
743
|
+
)
|
|
744
|
+
)
|
|
745
|
+
return procedures
|
|
@@ -41,6 +41,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
|
|
|
41
41
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
42
42
|
from datahub.ingestion.source.snowflake.snowflake_schema import (
|
|
43
43
|
SCHEMA_PARALLELISM,
|
|
44
|
+
BaseProcedure,
|
|
44
45
|
SnowflakeColumn,
|
|
45
46
|
SnowflakeDatabase,
|
|
46
47
|
SnowflakeDataDictionary,
|
|
@@ -63,12 +64,14 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
63
64
|
from datahub.ingestion.source.sql.sql_utils import (
|
|
64
65
|
add_table_to_schema_container,
|
|
65
66
|
gen_database_container,
|
|
66
|
-
gen_database_key,
|
|
67
67
|
gen_schema_container,
|
|
68
|
-
gen_schema_key,
|
|
69
68
|
get_dataplatform_instance_aspect,
|
|
70
69
|
get_domain_wu,
|
|
71
70
|
)
|
|
71
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
72
|
+
generate_procedure_container_workunits,
|
|
73
|
+
generate_procedure_workunits,
|
|
74
|
+
)
|
|
72
75
|
from datahub.ingestion.source_report.ingestion_stage import (
|
|
73
76
|
EXTERNAL_TABLE_DDL_LINEAGE,
|
|
74
77
|
LINEAGE_EXTRACTION,
|
|
@@ -448,10 +451,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
448
451
|
if self.config.include_streams:
|
|
449
452
|
self.report.num_get_streams_for_schema_queries += 1
|
|
450
453
|
streams = self.fetch_streams_for_schema(
|
|
451
|
-
snowflake_schema,
|
|
454
|
+
snowflake_schema,
|
|
455
|
+
db_name,
|
|
452
456
|
)
|
|
453
457
|
yield from self._process_streams(streams, snowflake_schema, db_name)
|
|
454
458
|
|
|
459
|
+
if self.config.include_procedures:
|
|
460
|
+
procedures = self.fetch_procedures_for_schema(snowflake_schema, db_name)
|
|
461
|
+
yield from self._process_procedures(procedures, snowflake_schema, db_name)
|
|
462
|
+
|
|
455
463
|
if self.config.include_technical_schema and snowflake_schema.tags:
|
|
456
464
|
yield from self._process_tags_in_schema(snowflake_schema)
|
|
457
465
|
|
|
@@ -536,6 +544,26 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
536
544
|
for stream in streams:
|
|
537
545
|
yield from self._process_stream(stream, snowflake_schema, db_name)
|
|
538
546
|
|
|
547
|
+
def _process_procedures(
|
|
548
|
+
self,
|
|
549
|
+
procedures: List[BaseProcedure],
|
|
550
|
+
snowflake_schema: SnowflakeSchema,
|
|
551
|
+
db_name: str,
|
|
552
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
553
|
+
if self.config.include_technical_schema:
|
|
554
|
+
if procedures:
|
|
555
|
+
yield from generate_procedure_container_workunits(
|
|
556
|
+
self.identifiers.gen_database_key(
|
|
557
|
+
db_name,
|
|
558
|
+
),
|
|
559
|
+
self.identifiers.gen_schema_key(
|
|
560
|
+
db_name=db_name,
|
|
561
|
+
schema_name=snowflake_schema.name,
|
|
562
|
+
),
|
|
563
|
+
)
|
|
564
|
+
for procedure in procedures:
|
|
565
|
+
yield from self._process_procedure(procedure, snowflake_schema, db_name)
|
|
566
|
+
|
|
539
567
|
def _process_tags_in_schema(
|
|
540
568
|
self, snowflake_schema: SnowflakeSchema
|
|
541
569
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -819,13 +847,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
819
847
|
entityUrn=dataset_urn, aspect=dataset_properties
|
|
820
848
|
).as_workunit()
|
|
821
849
|
|
|
822
|
-
schema_container_key = gen_schema_key(
|
|
823
|
-
db_name=self.snowflake_identifier(db_name),
|
|
824
|
-
schema=self.snowflake_identifier(schema_name),
|
|
825
|
-
platform=self.platform,
|
|
826
|
-
platform_instance=self.config.platform_instance,
|
|
827
|
-
env=self.config.env,
|
|
828
|
-
)
|
|
850
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema_name)
|
|
829
851
|
|
|
830
852
|
if self.config.extract_tags_as_structured_properties:
|
|
831
853
|
yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
|
|
@@ -1094,11 +1116,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1094
1116
|
def gen_database_containers(
|
|
1095
1117
|
self, database: SnowflakeDatabase
|
|
1096
1118
|
) -> Iterable[MetadataWorkUnit]:
|
|
1097
|
-
database_container_key = gen_database_key(
|
|
1098
|
-
|
|
1099
|
-
platform=self.platform,
|
|
1100
|
-
platform_instance=self.config.platform_instance,
|
|
1101
|
-
env=self.config.env,
|
|
1119
|
+
database_container_key = self.identifiers.gen_database_key(
|
|
1120
|
+
database.name,
|
|
1102
1121
|
)
|
|
1103
1122
|
|
|
1104
1123
|
yield from gen_database_container(
|
|
@@ -1147,21 +1166,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1147
1166
|
def gen_schema_containers(
|
|
1148
1167
|
self, schema: SnowflakeSchema, db_name: str
|
|
1149
1168
|
) -> Iterable[MetadataWorkUnit]:
|
|
1150
|
-
|
|
1151
|
-
database_container_key = gen_database_key(
|
|
1152
|
-
database=self.snowflake_identifier(db_name),
|
|
1153
|
-
platform=self.platform,
|
|
1154
|
-
platform_instance=self.config.platform_instance,
|
|
1155
|
-
env=self.config.env,
|
|
1156
|
-
)
|
|
1169
|
+
database_container_key = self.identifiers.gen_database_key(db_name)
|
|
1157
1170
|
|
|
1158
|
-
schema_container_key = gen_schema_key(
|
|
1159
|
-
db_name=self.snowflake_identifier(db_name),
|
|
1160
|
-
schema=schema_name,
|
|
1161
|
-
platform=self.platform,
|
|
1162
|
-
platform_instance=self.config.platform_instance,
|
|
1163
|
-
env=self.config.env,
|
|
1164
|
-
)
|
|
1171
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema.name)
|
|
1165
1172
|
|
|
1166
1173
|
yield from gen_schema_container(
|
|
1167
1174
|
name=schema.name,
|
|
@@ -1290,13 +1297,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1290
1297
|
)
|
|
1291
1298
|
|
|
1292
1299
|
def fetch_streams_for_schema(
|
|
1293
|
-
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1300
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1294
1301
|
) -> List[SnowflakeStream]:
|
|
1295
1302
|
try:
|
|
1296
1303
|
streams: List[SnowflakeStream] = []
|
|
1297
|
-
for stream in self.get_streams_for_schema(
|
|
1304
|
+
for stream in self.get_streams_for_schema(snowflake_schema.name, db_name):
|
|
1298
1305
|
stream_identifier = self.identifiers.get_dataset_identifier(
|
|
1299
|
-
stream.name,
|
|
1306
|
+
stream.name, snowflake_schema.name, db_name
|
|
1300
1307
|
)
|
|
1301
1308
|
|
|
1302
1309
|
self.report.report_entity_scanned(stream_identifier, "stream")
|
|
@@ -1310,16 +1317,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1310
1317
|
snowflake_schema.streams = [stream.name for stream in streams]
|
|
1311
1318
|
return streams
|
|
1312
1319
|
except Exception as e:
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
return []
|
|
1320
|
+
self.structured_reporter.warning(
|
|
1321
|
+
title="Failed to get streams for schema",
|
|
1322
|
+
message="Please check permissions"
|
|
1323
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1324
|
+
else "",
|
|
1325
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1326
|
+
exc=e,
|
|
1327
|
+
)
|
|
1328
|
+
return []
|
|
1323
1329
|
|
|
1324
1330
|
def get_streams_for_schema(
|
|
1325
1331
|
self, schema_name: str, db_name: str
|
|
@@ -1328,6 +1334,42 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1328
1334
|
|
|
1329
1335
|
return streams.get(schema_name, [])
|
|
1330
1336
|
|
|
1337
|
+
def fetch_procedures_for_schema(
|
|
1338
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1339
|
+
) -> List[BaseProcedure]:
|
|
1340
|
+
try:
|
|
1341
|
+
procedures: List[BaseProcedure] = []
|
|
1342
|
+
for procedure in self.get_procedures_for_schema(snowflake_schema, db_name):
|
|
1343
|
+
procedure_qualified_name = self.identifiers.get_dataset_identifier(
|
|
1344
|
+
procedure.name, snowflake_schema.name, db_name
|
|
1345
|
+
)
|
|
1346
|
+
self.report.report_entity_scanned(procedure_qualified_name, "procedure")
|
|
1347
|
+
|
|
1348
|
+
if self.filters.is_procedure_allowed(procedure_qualified_name):
|
|
1349
|
+
procedures.append(procedure)
|
|
1350
|
+
else:
|
|
1351
|
+
self.report.report_dropped(procedure_qualified_name)
|
|
1352
|
+
return procedures
|
|
1353
|
+
except Exception as e:
|
|
1354
|
+
self.structured_reporter.warning(
|
|
1355
|
+
title="Failed to get procedures for schema",
|
|
1356
|
+
message="Please check permissions"
|
|
1357
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1358
|
+
else "",
|
|
1359
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1360
|
+
exc=e,
|
|
1361
|
+
)
|
|
1362
|
+
return []
|
|
1363
|
+
|
|
1364
|
+
def get_procedures_for_schema(
|
|
1365
|
+
self,
|
|
1366
|
+
snowflake_schema: SnowflakeSchema,
|
|
1367
|
+
db_name: str,
|
|
1368
|
+
) -> List[BaseProcedure]:
|
|
1369
|
+
procedures = self.data_dictionary.get_procedures_for_database(db_name)
|
|
1370
|
+
|
|
1371
|
+
return procedures.get(snowflake_schema.name, [])
|
|
1372
|
+
|
|
1331
1373
|
def _process_stream(
|
|
1332
1374
|
self,
|
|
1333
1375
|
stream: SnowflakeStream,
|
|
@@ -1350,6 +1392,34 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1350
1392
|
"Failed to get columns for stream:", stream.name, exc=e
|
|
1351
1393
|
)
|
|
1352
1394
|
|
|
1395
|
+
def _process_procedure(
|
|
1396
|
+
self,
|
|
1397
|
+
procedure: BaseProcedure,
|
|
1398
|
+
snowflake_schema: SnowflakeSchema,
|
|
1399
|
+
db_name: str,
|
|
1400
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1401
|
+
try:
|
|
1402
|
+
# TODO: For CLL, we should process procedures after all tables are processed
|
|
1403
|
+
yield from generate_procedure_workunits(
|
|
1404
|
+
procedure,
|
|
1405
|
+
database_key=self.identifiers.gen_database_key(
|
|
1406
|
+
db_name,
|
|
1407
|
+
),
|
|
1408
|
+
schema_key=self.identifiers.gen_schema_key(
|
|
1409
|
+
db_name, snowflake_schema.name
|
|
1410
|
+
),
|
|
1411
|
+
schema_resolver=(
|
|
1412
|
+
self.aggregator._schema_resolver if self.aggregator else None
|
|
1413
|
+
),
|
|
1414
|
+
)
|
|
1415
|
+
except Exception as e:
|
|
1416
|
+
self.structured_reporter.warning(
|
|
1417
|
+
title="Failed to ingest stored procedure",
|
|
1418
|
+
message="",
|
|
1419
|
+
context=procedure.name,
|
|
1420
|
+
exc=e,
|
|
1421
|
+
)
|
|
1422
|
+
|
|
1353
1423
|
def get_columns_for_stream(
|
|
1354
1424
|
self,
|
|
1355
1425
|
source_object: str, # Qualified name of source table/view
|
|
@@ -3,7 +3,10 @@ from functools import cached_property
|
|
|
3
3
|
from typing import ClassVar, List, Literal, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
6
|
-
from datahub.emitter.mce_builder import
|
|
6
|
+
from datahub.emitter.mce_builder import (
|
|
7
|
+
make_dataset_urn_with_platform_instance,
|
|
8
|
+
)
|
|
9
|
+
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
7
10
|
from datahub.ingestion.api.source import SourceReport
|
|
8
11
|
from datahub.ingestion.source.snowflake.constants import (
|
|
9
12
|
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
|
|
@@ -16,6 +19,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
|
16
19
|
SnowflakeV2Config,
|
|
17
20
|
)
|
|
18
21
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
22
|
+
from datahub.ingestion.source.sql.sql_utils import gen_database_key, gen_schema_key
|
|
19
23
|
|
|
20
24
|
|
|
21
25
|
class SnowflakeStructuredReportMixin(abc.ABC):
|
|
@@ -180,6 +184,9 @@ class SnowflakeFilter:
|
|
|
180
184
|
|
|
181
185
|
return True
|
|
182
186
|
|
|
187
|
+
def is_procedure_allowed(self, procedure_name: str) -> bool:
|
|
188
|
+
return self.filter_config.procedure_pattern.allowed(procedure_name)
|
|
189
|
+
|
|
183
190
|
|
|
184
191
|
def _combine_identifier_parts(
|
|
185
192
|
*, table_name: str, schema_name: str, db_name: str
|
|
@@ -330,6 +337,23 @@ class SnowflakeIdentifierBuilder:
|
|
|
330
337
|
else user_name
|
|
331
338
|
)
|
|
332
339
|
|
|
340
|
+
def gen_schema_key(self, db_name: str, schema_name: str) -> SchemaKey:
|
|
341
|
+
return gen_schema_key(
|
|
342
|
+
db_name=self.snowflake_identifier(db_name),
|
|
343
|
+
schema=self.snowflake_identifier(schema_name),
|
|
344
|
+
platform=self.platform,
|
|
345
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
346
|
+
env=self.identifier_config.env,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
def gen_database_key(self, db_name: str) -> DatabaseKey:
|
|
350
|
+
return gen_database_key(
|
|
351
|
+
database=self.snowflake_identifier(db_name),
|
|
352
|
+
platform=self.platform,
|
|
353
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
354
|
+
env=self.identifier_config.env,
|
|
355
|
+
)
|
|
356
|
+
|
|
333
357
|
|
|
334
358
|
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
335
359
|
platform = "snowflake"
|
|
@@ -15,6 +15,7 @@ from datahub.ingestion.source.common.subtypes import (
|
|
|
15
15
|
FlowContainerSubTypes,
|
|
16
16
|
JobContainerSubTypes,
|
|
17
17
|
)
|
|
18
|
+
from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
|
|
18
19
|
from datahub.metadata.schema_classes import (
|
|
19
20
|
ContainerClass,
|
|
20
21
|
DataFlowInfoClass,
|
|
@@ -135,6 +136,19 @@ class StoredProcedure:
|
|
|
135
136
|
def escape_full_name(self) -> str:
|
|
136
137
|
return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
|
|
137
138
|
|
|
139
|
+
def to_base_procedure(self) -> BaseProcedure:
|
|
140
|
+
return BaseProcedure(
|
|
141
|
+
name=self.formatted_name,
|
|
142
|
+
procedure_definition=self.code,
|
|
143
|
+
created=None,
|
|
144
|
+
last_altered=None,
|
|
145
|
+
comment=None,
|
|
146
|
+
argument_signature=None,
|
|
147
|
+
return_type=None,
|
|
148
|
+
language="SQL",
|
|
149
|
+
extra_properties=None,
|
|
150
|
+
)
|
|
151
|
+
|
|
138
152
|
|
|
139
153
|
@dataclass
|
|
140
154
|
class JobStep:
|
|
@@ -222,7 +236,7 @@ class MSSQLDataJob:
|
|
|
222
236
|
type = (
|
|
223
237
|
JobContainerSubTypes.MSSQL_JOBSTEP
|
|
224
238
|
if isinstance(self.entity, JobStep)
|
|
225
|
-
else JobContainerSubTypes.
|
|
239
|
+
else JobContainerSubTypes.STORED_PROCEDURE
|
|
226
240
|
)
|
|
227
241
|
return SubTypesClass(
|
|
228
242
|
typeNames=[type],
|
|
@@ -37,9 +37,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
|
|
|
37
37
|
ProcedureParameter,
|
|
38
38
|
StoredProcedure,
|
|
39
39
|
)
|
|
40
|
-
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
|
|
41
|
-
generate_procedure_lineage,
|
|
42
|
-
)
|
|
43
40
|
from datahub.ingestion.source.sql.sql_common import (
|
|
44
41
|
SQLAlchemySource,
|
|
45
42
|
SqlWorkUnit,
|
|
@@ -50,6 +47,9 @@ from datahub.ingestion.source.sql.sql_config import (
|
|
|
50
47
|
make_sqlalchemy_uri,
|
|
51
48
|
)
|
|
52
49
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
50
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
51
|
+
generate_procedure_lineage,
|
|
52
|
+
)
|
|
53
53
|
from datahub.utilities.file_backed_collections import FileBackedList
|
|
54
54
|
|
|
55
55
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -65,6 +65,8 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
|
65
65
|
# defaults
|
|
66
66
|
host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
|
|
67
67
|
scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
|
|
68
|
+
|
|
69
|
+
# TODO: rename to include_procedures ?
|
|
68
70
|
include_stored_procedures: bool = Field(
|
|
69
71
|
default=True,
|
|
70
72
|
description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
|
|
@@ -763,9 +765,11 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
763
765
|
yield from auto_workunit(
|
|
764
766
|
generate_procedure_lineage(
|
|
765
767
|
schema_resolver=self.get_schema_resolver(),
|
|
766
|
-
procedure=procedure,
|
|
768
|
+
procedure=procedure.to_base_procedure(),
|
|
767
769
|
procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
|
|
768
770
|
is_temp_table=self.is_temp_table,
|
|
771
|
+
default_db=procedure.db,
|
|
772
|
+
default_schema=procedure.schema,
|
|
769
773
|
)
|
|
770
774
|
)
|
|
771
775
|
|
|
@@ -31,7 +31,9 @@ from datahub.ingestion.source.sql.sql_common import (
|
|
|
31
31
|
SQLAlchemySource,
|
|
32
32
|
make_sqlalchemy_type,
|
|
33
33
|
)
|
|
34
|
-
from datahub.ingestion.source.sql.sql_config import
|
|
34
|
+
from datahub.ingestion.source.sql.sql_config import (
|
|
35
|
+
BasicSQLAlchemyConfig,
|
|
36
|
+
)
|
|
35
37
|
|
|
36
38
|
logger = logging.getLogger(__name__)
|
|
37
39
|
|
|
@@ -71,10 +73,12 @@ class OracleConfig(BasicSQLAlchemyConfig):
|
|
|
71
73
|
description="Will be set automatically to default value.",
|
|
72
74
|
)
|
|
73
75
|
service_name: Optional[str] = Field(
|
|
74
|
-
default=None,
|
|
76
|
+
default=None,
|
|
77
|
+
description="Oracle service name. If using, omit `database`.",
|
|
75
78
|
)
|
|
76
79
|
database: Optional[str] = Field(
|
|
77
|
-
default=None,
|
|
80
|
+
default=None,
|
|
81
|
+
description="If using, omit `service_name`.",
|
|
78
82
|
)
|
|
79
83
|
add_database_name_to_urn: Optional[bool] = Field(
|
|
80
84
|
default=False,
|
|
@@ -631,7 +635,6 @@ class OracleSource(SQLAlchemySource):
|
|
|
631
635
|
- Table, row, and column statistics via optional SQL profiling
|
|
632
636
|
|
|
633
637
|
Using the Oracle source requires that you've also installed the correct drivers; see the [cx_Oracle docs](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html). The easiest one is the [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client.html).
|
|
634
|
-
|
|
635
638
|
"""
|
|
636
639
|
|
|
637
640
|
config: OracleConfig
|
|
@@ -661,6 +664,8 @@ class OracleSource(SQLAlchemySource):
|
|
|
661
664
|
database name from Connection URL, which does not work when using
|
|
662
665
|
service instead of database.
|
|
663
666
|
In that case, it tries to retrieve the database name by sending a query to the DB.
|
|
667
|
+
|
|
668
|
+
Note: This is used as a fallback if database is not specified in the config.
|
|
664
669
|
"""
|
|
665
670
|
|
|
666
671
|
# call default implementation first
|
|
@@ -687,7 +692,49 @@ class OracleSource(SQLAlchemySource):
|
|
|
687
692
|
# To silent the mypy lint error
|
|
688
693
|
yield cast(Inspector, inspector)
|
|
689
694
|
|
|
695
|
+
def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
|
|
696
|
+
"""
|
|
697
|
+
Override the get_db_schema method to ensure proper schema name extraction.
|
|
698
|
+
This method is used during view lineage extraction to determine the default schema
|
|
699
|
+
for unqualified table names in view definitions.
|
|
700
|
+
"""
|
|
701
|
+
try:
|
|
702
|
+
# Try to get the schema from the dataset identifier
|
|
703
|
+
parts = dataset_identifier.split(".")
|
|
704
|
+
|
|
705
|
+
# Handle the identifier format differently based on add_database_name_to_urn flag
|
|
706
|
+
if self.config.add_database_name_to_urn:
|
|
707
|
+
if len(parts) >= 3:
|
|
708
|
+
# Format is: database.schema.view when add_database_name_to_urn=True
|
|
709
|
+
db_name = parts[-3]
|
|
710
|
+
schema_name = parts[-2]
|
|
711
|
+
return db_name, schema_name
|
|
712
|
+
elif len(parts) >= 2:
|
|
713
|
+
# Handle the case where database might be missing even with flag enabled
|
|
714
|
+
# If we have a database in the config, use that
|
|
715
|
+
db_name = str(self.config.database)
|
|
716
|
+
schema_name = parts[-2]
|
|
717
|
+
return db_name, schema_name
|
|
718
|
+
else:
|
|
719
|
+
# Format is: schema.view when add_database_name_to_urn=False
|
|
720
|
+
if len(parts) >= 2:
|
|
721
|
+
# When add_database_name_to_urn is False, don't include database in the result
|
|
722
|
+
db_name = None
|
|
723
|
+
schema_name = parts[-2]
|
|
724
|
+
return db_name, schema_name
|
|
725
|
+
except Exception as e:
|
|
726
|
+
logger.warning(
|
|
727
|
+
f"Error extracting schema from identifier {dataset_identifier}: {e}"
|
|
728
|
+
)
|
|
729
|
+
|
|
730
|
+
# Fall back to parent implementation if our approach fails
|
|
731
|
+
db_name, schema_name = super().get_db_schema(dataset_identifier)
|
|
732
|
+
return db_name, schema_name
|
|
733
|
+
|
|
690
734
|
def get_workunits(self):
|
|
735
|
+
"""
|
|
736
|
+
Override get_workunits to patch Oracle dialect for custom types.
|
|
737
|
+
"""
|
|
691
738
|
with patch.dict(
|
|
692
739
|
"sqlalchemy.dialects.oracle.base.OracleDialect.ischema_names",
|
|
693
740
|
{klass.__name__: klass for klass in extra_oracle_types},
|
|
File without changes
|