acryl-datahub 1.0.0.1rc2__py3-none-any.whl → 1.0.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/METADATA +2541 -2541
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/RECORD +28 -26
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +2 -2
- datahub/ingestion/graph/client.py +6 -11
- datahub/ingestion/source/common/subtypes.py +1 -1
- datahub/ingestion/source/mlflow.py +19 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/schema.avsc +2 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
- datahub/sql_parsing/split_statements.py +5 -1
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/top_level.txt +0 -0
|
@@ -41,6 +41,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
|
|
|
41
41
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
42
42
|
from datahub.ingestion.source.snowflake.snowflake_schema import (
|
|
43
43
|
SCHEMA_PARALLELISM,
|
|
44
|
+
BaseProcedure,
|
|
44
45
|
SnowflakeColumn,
|
|
45
46
|
SnowflakeDatabase,
|
|
46
47
|
SnowflakeDataDictionary,
|
|
@@ -63,12 +64,14 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
63
64
|
from datahub.ingestion.source.sql.sql_utils import (
|
|
64
65
|
add_table_to_schema_container,
|
|
65
66
|
gen_database_container,
|
|
66
|
-
gen_database_key,
|
|
67
67
|
gen_schema_container,
|
|
68
|
-
gen_schema_key,
|
|
69
68
|
get_dataplatform_instance_aspect,
|
|
70
69
|
get_domain_wu,
|
|
71
70
|
)
|
|
71
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
72
|
+
generate_procedure_container_workunits,
|
|
73
|
+
generate_procedure_workunits,
|
|
74
|
+
)
|
|
72
75
|
from datahub.ingestion.source_report.ingestion_stage import (
|
|
73
76
|
EXTERNAL_TABLE_DDL_LINEAGE,
|
|
74
77
|
LINEAGE_EXTRACTION,
|
|
@@ -448,10 +451,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
448
451
|
if self.config.include_streams:
|
|
449
452
|
self.report.num_get_streams_for_schema_queries += 1
|
|
450
453
|
streams = self.fetch_streams_for_schema(
|
|
451
|
-
snowflake_schema,
|
|
454
|
+
snowflake_schema,
|
|
455
|
+
db_name,
|
|
452
456
|
)
|
|
453
457
|
yield from self._process_streams(streams, snowflake_schema, db_name)
|
|
454
458
|
|
|
459
|
+
if self.config.include_procedures:
|
|
460
|
+
procedures = self.fetch_procedures_for_schema(snowflake_schema, db_name)
|
|
461
|
+
yield from self._process_procedures(procedures, snowflake_schema, db_name)
|
|
462
|
+
|
|
455
463
|
if self.config.include_technical_schema and snowflake_schema.tags:
|
|
456
464
|
yield from self._process_tags_in_schema(snowflake_schema)
|
|
457
465
|
|
|
@@ -536,6 +544,26 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
536
544
|
for stream in streams:
|
|
537
545
|
yield from self._process_stream(stream, snowflake_schema, db_name)
|
|
538
546
|
|
|
547
|
+
def _process_procedures(
|
|
548
|
+
self,
|
|
549
|
+
procedures: List[BaseProcedure],
|
|
550
|
+
snowflake_schema: SnowflakeSchema,
|
|
551
|
+
db_name: str,
|
|
552
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
553
|
+
if self.config.include_technical_schema:
|
|
554
|
+
if procedures:
|
|
555
|
+
yield from generate_procedure_container_workunits(
|
|
556
|
+
self.identifiers.gen_database_key(
|
|
557
|
+
db_name,
|
|
558
|
+
),
|
|
559
|
+
self.identifiers.gen_schema_key(
|
|
560
|
+
db_name=db_name,
|
|
561
|
+
schema_name=snowflake_schema.name,
|
|
562
|
+
),
|
|
563
|
+
)
|
|
564
|
+
for procedure in procedures:
|
|
565
|
+
yield from self._process_procedure(procedure, snowflake_schema, db_name)
|
|
566
|
+
|
|
539
567
|
def _process_tags_in_schema(
|
|
540
568
|
self, snowflake_schema: SnowflakeSchema
|
|
541
569
|
) -> Iterable[MetadataWorkUnit]:
|
|
@@ -819,13 +847,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
819
847
|
entityUrn=dataset_urn, aspect=dataset_properties
|
|
820
848
|
).as_workunit()
|
|
821
849
|
|
|
822
|
-
schema_container_key = gen_schema_key(
|
|
823
|
-
db_name=self.snowflake_identifier(db_name),
|
|
824
|
-
schema=self.snowflake_identifier(schema_name),
|
|
825
|
-
platform=self.platform,
|
|
826
|
-
platform_instance=self.config.platform_instance,
|
|
827
|
-
env=self.config.env,
|
|
828
|
-
)
|
|
850
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema_name)
|
|
829
851
|
|
|
830
852
|
if self.config.extract_tags_as_structured_properties:
|
|
831
853
|
yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
|
|
@@ -1094,11 +1116,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1094
1116
|
def gen_database_containers(
|
|
1095
1117
|
self, database: SnowflakeDatabase
|
|
1096
1118
|
) -> Iterable[MetadataWorkUnit]:
|
|
1097
|
-
database_container_key = gen_database_key(
|
|
1098
|
-
|
|
1099
|
-
platform=self.platform,
|
|
1100
|
-
platform_instance=self.config.platform_instance,
|
|
1101
|
-
env=self.config.env,
|
|
1119
|
+
database_container_key = self.identifiers.gen_database_key(
|
|
1120
|
+
database.name,
|
|
1102
1121
|
)
|
|
1103
1122
|
|
|
1104
1123
|
yield from gen_database_container(
|
|
@@ -1147,21 +1166,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1147
1166
|
def gen_schema_containers(
|
|
1148
1167
|
self, schema: SnowflakeSchema, db_name: str
|
|
1149
1168
|
) -> Iterable[MetadataWorkUnit]:
|
|
1150
|
-
|
|
1151
|
-
database_container_key = gen_database_key(
|
|
1152
|
-
database=self.snowflake_identifier(db_name),
|
|
1153
|
-
platform=self.platform,
|
|
1154
|
-
platform_instance=self.config.platform_instance,
|
|
1155
|
-
env=self.config.env,
|
|
1156
|
-
)
|
|
1169
|
+
database_container_key = self.identifiers.gen_database_key(db_name)
|
|
1157
1170
|
|
|
1158
|
-
schema_container_key = gen_schema_key(
|
|
1159
|
-
db_name=self.snowflake_identifier(db_name),
|
|
1160
|
-
schema=schema_name,
|
|
1161
|
-
platform=self.platform,
|
|
1162
|
-
platform_instance=self.config.platform_instance,
|
|
1163
|
-
env=self.config.env,
|
|
1164
|
-
)
|
|
1171
|
+
schema_container_key = self.identifiers.gen_schema_key(db_name, schema.name)
|
|
1165
1172
|
|
|
1166
1173
|
yield from gen_schema_container(
|
|
1167
1174
|
name=schema.name,
|
|
@@ -1290,13 +1297,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1290
1297
|
)
|
|
1291
1298
|
|
|
1292
1299
|
def fetch_streams_for_schema(
|
|
1293
|
-
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1300
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1294
1301
|
) -> List[SnowflakeStream]:
|
|
1295
1302
|
try:
|
|
1296
1303
|
streams: List[SnowflakeStream] = []
|
|
1297
|
-
for stream in self.get_streams_for_schema(
|
|
1304
|
+
for stream in self.get_streams_for_schema(snowflake_schema.name, db_name):
|
|
1298
1305
|
stream_identifier = self.identifiers.get_dataset_identifier(
|
|
1299
|
-
stream.name,
|
|
1306
|
+
stream.name, snowflake_schema.name, db_name
|
|
1300
1307
|
)
|
|
1301
1308
|
|
|
1302
1309
|
self.report.report_entity_scanned(stream_identifier, "stream")
|
|
@@ -1310,16 +1317,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1310
1317
|
snowflake_schema.streams = [stream.name for stream in streams]
|
|
1311
1318
|
return streams
|
|
1312
1319
|
except Exception as e:
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
return []
|
|
1320
|
+
self.structured_reporter.warning(
|
|
1321
|
+
title="Failed to get streams for schema",
|
|
1322
|
+
message="Please check permissions"
|
|
1323
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1324
|
+
else "",
|
|
1325
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1326
|
+
exc=e,
|
|
1327
|
+
)
|
|
1328
|
+
return []
|
|
1323
1329
|
|
|
1324
1330
|
def get_streams_for_schema(
|
|
1325
1331
|
self, schema_name: str, db_name: str
|
|
@@ -1328,6 +1334,42 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1328
1334
|
|
|
1329
1335
|
return streams.get(schema_name, [])
|
|
1330
1336
|
|
|
1337
|
+
def fetch_procedures_for_schema(
|
|
1338
|
+
self, snowflake_schema: SnowflakeSchema, db_name: str
|
|
1339
|
+
) -> List[BaseProcedure]:
|
|
1340
|
+
try:
|
|
1341
|
+
procedures: List[BaseProcedure] = []
|
|
1342
|
+
for procedure in self.get_procedures_for_schema(snowflake_schema, db_name):
|
|
1343
|
+
procedure_qualified_name = self.identifiers.get_dataset_identifier(
|
|
1344
|
+
procedure.name, snowflake_schema.name, db_name
|
|
1345
|
+
)
|
|
1346
|
+
self.report.report_entity_scanned(procedure_qualified_name, "procedure")
|
|
1347
|
+
|
|
1348
|
+
if self.filters.is_procedure_allowed(procedure_qualified_name):
|
|
1349
|
+
procedures.append(procedure)
|
|
1350
|
+
else:
|
|
1351
|
+
self.report.report_dropped(procedure_qualified_name)
|
|
1352
|
+
return procedures
|
|
1353
|
+
except Exception as e:
|
|
1354
|
+
self.structured_reporter.warning(
|
|
1355
|
+
title="Failed to get procedures for schema",
|
|
1356
|
+
message="Please check permissions"
|
|
1357
|
+
if isinstance(e, SnowflakePermissionError)
|
|
1358
|
+
else "",
|
|
1359
|
+
context=f"{db_name}.{snowflake_schema.name}",
|
|
1360
|
+
exc=e,
|
|
1361
|
+
)
|
|
1362
|
+
return []
|
|
1363
|
+
|
|
1364
|
+
def get_procedures_for_schema(
|
|
1365
|
+
self,
|
|
1366
|
+
snowflake_schema: SnowflakeSchema,
|
|
1367
|
+
db_name: str,
|
|
1368
|
+
) -> List[BaseProcedure]:
|
|
1369
|
+
procedures = self.data_dictionary.get_procedures_for_database(db_name)
|
|
1370
|
+
|
|
1371
|
+
return procedures.get(snowflake_schema.name, [])
|
|
1372
|
+
|
|
1331
1373
|
def _process_stream(
|
|
1332
1374
|
self,
|
|
1333
1375
|
stream: SnowflakeStream,
|
|
@@ -1350,6 +1392,34 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
|
|
|
1350
1392
|
"Failed to get columns for stream:", stream.name, exc=e
|
|
1351
1393
|
)
|
|
1352
1394
|
|
|
1395
|
+
def _process_procedure(
|
|
1396
|
+
self,
|
|
1397
|
+
procedure: BaseProcedure,
|
|
1398
|
+
snowflake_schema: SnowflakeSchema,
|
|
1399
|
+
db_name: str,
|
|
1400
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
1401
|
+
try:
|
|
1402
|
+
# TODO: For CLL, we should process procedures after all tables are processed
|
|
1403
|
+
yield from generate_procedure_workunits(
|
|
1404
|
+
procedure,
|
|
1405
|
+
database_key=self.identifiers.gen_database_key(
|
|
1406
|
+
db_name,
|
|
1407
|
+
),
|
|
1408
|
+
schema_key=self.identifiers.gen_schema_key(
|
|
1409
|
+
db_name, snowflake_schema.name
|
|
1410
|
+
),
|
|
1411
|
+
schema_resolver=(
|
|
1412
|
+
self.aggregator._schema_resolver if self.aggregator else None
|
|
1413
|
+
),
|
|
1414
|
+
)
|
|
1415
|
+
except Exception as e:
|
|
1416
|
+
self.structured_reporter.warning(
|
|
1417
|
+
title="Failed to ingest stored procedure",
|
|
1418
|
+
message="",
|
|
1419
|
+
context=procedure.name,
|
|
1420
|
+
exc=e,
|
|
1421
|
+
)
|
|
1422
|
+
|
|
1353
1423
|
def get_columns_for_stream(
|
|
1354
1424
|
self,
|
|
1355
1425
|
source_object: str, # Qualified name of source table/view
|
|
@@ -3,7 +3,10 @@ from functools import cached_property
|
|
|
3
3
|
from typing import ClassVar, List, Literal, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from datahub.configuration.pattern_utils import is_schema_allowed
|
|
6
|
-
from datahub.emitter.mce_builder import
|
|
6
|
+
from datahub.emitter.mce_builder import (
|
|
7
|
+
make_dataset_urn_with_platform_instance,
|
|
8
|
+
)
|
|
9
|
+
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
7
10
|
from datahub.ingestion.api.source import SourceReport
|
|
8
11
|
from datahub.ingestion.source.snowflake.constants import (
|
|
9
12
|
SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
|
|
@@ -16,6 +19,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
|
|
|
16
19
|
SnowflakeV2Config,
|
|
17
20
|
)
|
|
18
21
|
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
|
|
22
|
+
from datahub.ingestion.source.sql.sql_utils import gen_database_key, gen_schema_key
|
|
19
23
|
|
|
20
24
|
|
|
21
25
|
class SnowflakeStructuredReportMixin(abc.ABC):
|
|
@@ -180,6 +184,9 @@ class SnowflakeFilter:
|
|
|
180
184
|
|
|
181
185
|
return True
|
|
182
186
|
|
|
187
|
+
def is_procedure_allowed(self, procedure_name: str) -> bool:
|
|
188
|
+
return self.filter_config.procedure_pattern.allowed(procedure_name)
|
|
189
|
+
|
|
183
190
|
|
|
184
191
|
def _combine_identifier_parts(
|
|
185
192
|
*, table_name: str, schema_name: str, db_name: str
|
|
@@ -330,6 +337,23 @@ class SnowflakeIdentifierBuilder:
|
|
|
330
337
|
else user_name
|
|
331
338
|
)
|
|
332
339
|
|
|
340
|
+
def gen_schema_key(self, db_name: str, schema_name: str) -> SchemaKey:
|
|
341
|
+
return gen_schema_key(
|
|
342
|
+
db_name=self.snowflake_identifier(db_name),
|
|
343
|
+
schema=self.snowflake_identifier(schema_name),
|
|
344
|
+
platform=self.platform,
|
|
345
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
346
|
+
env=self.identifier_config.env,
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
def gen_database_key(self, db_name: str) -> DatabaseKey:
|
|
350
|
+
return gen_database_key(
|
|
351
|
+
database=self.snowflake_identifier(db_name),
|
|
352
|
+
platform=self.platform,
|
|
353
|
+
platform_instance=self.identifier_config.platform_instance,
|
|
354
|
+
env=self.identifier_config.env,
|
|
355
|
+
)
|
|
356
|
+
|
|
333
357
|
|
|
334
358
|
class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
|
|
335
359
|
platform = "snowflake"
|
|
@@ -15,6 +15,7 @@ from datahub.ingestion.source.common.subtypes import (
|
|
|
15
15
|
FlowContainerSubTypes,
|
|
16
16
|
JobContainerSubTypes,
|
|
17
17
|
)
|
|
18
|
+
from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
|
|
18
19
|
from datahub.metadata.schema_classes import (
|
|
19
20
|
ContainerClass,
|
|
20
21
|
DataFlowInfoClass,
|
|
@@ -135,6 +136,19 @@ class StoredProcedure:
|
|
|
135
136
|
def escape_full_name(self) -> str:
|
|
136
137
|
return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
|
|
137
138
|
|
|
139
|
+
def to_base_procedure(self) -> BaseProcedure:
|
|
140
|
+
return BaseProcedure(
|
|
141
|
+
name=self.formatted_name,
|
|
142
|
+
procedure_definition=self.code,
|
|
143
|
+
created=None,
|
|
144
|
+
last_altered=None,
|
|
145
|
+
comment=None,
|
|
146
|
+
argument_signature=None,
|
|
147
|
+
return_type=None,
|
|
148
|
+
language="SQL",
|
|
149
|
+
extra_properties=None,
|
|
150
|
+
)
|
|
151
|
+
|
|
138
152
|
|
|
139
153
|
@dataclass
|
|
140
154
|
class JobStep:
|
|
@@ -222,7 +236,7 @@ class MSSQLDataJob:
|
|
|
222
236
|
type = (
|
|
223
237
|
JobContainerSubTypes.MSSQL_JOBSTEP
|
|
224
238
|
if isinstance(self.entity, JobStep)
|
|
225
|
-
else JobContainerSubTypes.
|
|
239
|
+
else JobContainerSubTypes.STORED_PROCEDURE
|
|
226
240
|
)
|
|
227
241
|
return SubTypesClass(
|
|
228
242
|
typeNames=[type],
|
|
@@ -37,9 +37,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
|
|
|
37
37
|
ProcedureParameter,
|
|
38
38
|
StoredProcedure,
|
|
39
39
|
)
|
|
40
|
-
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
|
|
41
|
-
generate_procedure_lineage,
|
|
42
|
-
)
|
|
43
40
|
from datahub.ingestion.source.sql.sql_common import (
|
|
44
41
|
SQLAlchemySource,
|
|
45
42
|
SqlWorkUnit,
|
|
@@ -50,6 +47,9 @@ from datahub.ingestion.source.sql.sql_config import (
|
|
|
50
47
|
make_sqlalchemy_uri,
|
|
51
48
|
)
|
|
52
49
|
from datahub.ingestion.source.sql.sql_report import SQLSourceReport
|
|
50
|
+
from datahub.ingestion.source.sql.stored_procedures.base import (
|
|
51
|
+
generate_procedure_lineage,
|
|
52
|
+
)
|
|
53
53
|
from datahub.utilities.file_backed_collections import FileBackedList
|
|
54
54
|
|
|
55
55
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -65,6 +65,8 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
|
|
|
65
65
|
# defaults
|
|
66
66
|
host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
|
|
67
67
|
scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
|
|
68
|
+
|
|
69
|
+
# TODO: rename to include_procedures ?
|
|
68
70
|
include_stored_procedures: bool = Field(
|
|
69
71
|
default=True,
|
|
70
72
|
description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
|
|
@@ -763,9 +765,11 @@ class SQLServerSource(SQLAlchemySource):
|
|
|
763
765
|
yield from auto_workunit(
|
|
764
766
|
generate_procedure_lineage(
|
|
765
767
|
schema_resolver=self.get_schema_resolver(),
|
|
766
|
-
procedure=procedure,
|
|
768
|
+
procedure=procedure.to_base_procedure(),
|
|
767
769
|
procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
|
|
768
770
|
is_temp_table=self.is_temp_table,
|
|
771
|
+
default_db=procedure.db,
|
|
772
|
+
default_schema=procedure.schema,
|
|
769
773
|
)
|
|
770
774
|
)
|
|
771
775
|
|
|
File without changes
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Callable, Dict, Iterable, Optional
|
|
4
|
+
|
|
5
|
+
from datahub.emitter.mce_builder import (
|
|
6
|
+
DEFAULT_ENV,
|
|
7
|
+
datahub_guid,
|
|
8
|
+
make_data_flow_urn,
|
|
9
|
+
make_data_job_urn,
|
|
10
|
+
make_data_platform_urn,
|
|
11
|
+
make_dataplatform_instance_urn,
|
|
12
|
+
)
|
|
13
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
14
|
+
from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
|
|
15
|
+
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
16
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
17
|
+
from datahub.ingestion.source.common.subtypes import (
|
|
18
|
+
FlowContainerSubTypes,
|
|
19
|
+
JobContainerSubTypes,
|
|
20
|
+
)
|
|
21
|
+
from datahub.ingestion.source.sql.stored_procedures.lineage import parse_procedure_code
|
|
22
|
+
from datahub.metadata.schema_classes import (
|
|
23
|
+
ContainerClass,
|
|
24
|
+
DataFlowInfoClass,
|
|
25
|
+
DataJobInfoClass,
|
|
26
|
+
DataPlatformInstanceClass,
|
|
27
|
+
DataTransformClass,
|
|
28
|
+
DataTransformLogicClass,
|
|
29
|
+
QueryStatementClass,
|
|
30
|
+
SubTypesClass,
|
|
31
|
+
)
|
|
32
|
+
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
|
|
36
|
+
class BaseProcedure:
|
|
37
|
+
name: str
|
|
38
|
+
procedure_definition: Optional[str]
|
|
39
|
+
created: Optional[datetime]
|
|
40
|
+
last_altered: Optional[datetime]
|
|
41
|
+
comment: Optional[str]
|
|
42
|
+
argument_signature: Optional[str]
|
|
43
|
+
return_type: Optional[str]
|
|
44
|
+
language: str
|
|
45
|
+
extra_properties: Optional[Dict[str, str]]
|
|
46
|
+
|
|
47
|
+
def get_procedure_identifier(
|
|
48
|
+
self,
|
|
49
|
+
) -> str:
|
|
50
|
+
if self.argument_signature:
|
|
51
|
+
argument_signature_hash = datahub_guid(
|
|
52
|
+
dict(argument_signature=self.argument_signature)
|
|
53
|
+
)
|
|
54
|
+
return f"{self.name}_{argument_signature_hash}"
|
|
55
|
+
|
|
56
|
+
return self.name
|
|
57
|
+
|
|
58
|
+
def to_urn(self, database_key: DatabaseKey, schema_key: Optional[SchemaKey]) -> str:
|
|
59
|
+
return make_data_job_urn(
|
|
60
|
+
orchestrator=database_key.platform,
|
|
61
|
+
flow_id=_get_procedure_flow_name(database_key, schema_key),
|
|
62
|
+
job_id=self.get_procedure_identifier(),
|
|
63
|
+
cluster=database_key.env or DEFAULT_ENV,
|
|
64
|
+
platform_instance=database_key.instance,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _generate_flow_workunits(
|
|
69
|
+
database_key: DatabaseKey, schema_key: Optional[SchemaKey]
|
|
70
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
71
|
+
"""Generate flow workunits for database and schema"""
|
|
72
|
+
|
|
73
|
+
procedure_flow_name = _get_procedure_flow_name(database_key, schema_key)
|
|
74
|
+
|
|
75
|
+
flow_urn = make_data_flow_urn(
|
|
76
|
+
orchestrator=database_key.platform,
|
|
77
|
+
flow_id=procedure_flow_name,
|
|
78
|
+
cluster=database_key.env or DEFAULT_ENV,
|
|
79
|
+
platform_instance=database_key.instance,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
yield MetadataChangeProposalWrapper(
|
|
83
|
+
entityUrn=flow_urn,
|
|
84
|
+
aspect=DataFlowInfoClass(
|
|
85
|
+
name=procedure_flow_name,
|
|
86
|
+
),
|
|
87
|
+
).as_workunit()
|
|
88
|
+
|
|
89
|
+
yield MetadataChangeProposalWrapper(
|
|
90
|
+
entityUrn=flow_urn,
|
|
91
|
+
aspect=SubTypesClass(
|
|
92
|
+
typeNames=[FlowContainerSubTypes.MSSQL_PROCEDURE_CONTAINER],
|
|
93
|
+
),
|
|
94
|
+
).as_workunit()
|
|
95
|
+
|
|
96
|
+
if database_key.instance:
|
|
97
|
+
yield MetadataChangeProposalWrapper(
|
|
98
|
+
entityUrn=flow_urn,
|
|
99
|
+
aspect=DataPlatformInstanceClass(
|
|
100
|
+
platform=make_data_platform_urn(database_key.platform),
|
|
101
|
+
instance=make_dataplatform_instance_urn(
|
|
102
|
+
platform=database_key.platform,
|
|
103
|
+
instance=database_key.instance,
|
|
104
|
+
),
|
|
105
|
+
),
|
|
106
|
+
).as_workunit()
|
|
107
|
+
|
|
108
|
+
yield MetadataChangeProposalWrapper(
|
|
109
|
+
entityUrn=flow_urn,
|
|
110
|
+
aspect=ContainerClass(container=database_key.as_urn()),
|
|
111
|
+
).as_workunit()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_procedure_flow_name(
|
|
115
|
+
database_key: DatabaseKey, schema_key: Optional[SchemaKey]
|
|
116
|
+
) -> str:
|
|
117
|
+
if schema_key:
|
|
118
|
+
procedure_flow_name = (
|
|
119
|
+
f"{schema_key.database}.{schema_key.db_schema}.stored_procedures"
|
|
120
|
+
)
|
|
121
|
+
else:
|
|
122
|
+
procedure_flow_name = f"{database_key.database}.stored_procedures"
|
|
123
|
+
return procedure_flow_name
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _generate_job_workunits(
|
|
127
|
+
database_key: DatabaseKey,
|
|
128
|
+
schema_key: Optional[SchemaKey],
|
|
129
|
+
procedure: BaseProcedure,
|
|
130
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
131
|
+
"""Generate job workunits for database, schema and procedure"""
|
|
132
|
+
|
|
133
|
+
job_urn = procedure.to_urn(database_key, schema_key)
|
|
134
|
+
|
|
135
|
+
yield MetadataChangeProposalWrapper(
|
|
136
|
+
entityUrn=job_urn,
|
|
137
|
+
aspect=DataJobInfoClass(
|
|
138
|
+
name=procedure.name,
|
|
139
|
+
type=JobContainerSubTypes.STORED_PROCEDURE,
|
|
140
|
+
description=procedure.comment,
|
|
141
|
+
customProperties=procedure.extra_properties,
|
|
142
|
+
),
|
|
143
|
+
).as_workunit()
|
|
144
|
+
|
|
145
|
+
yield MetadataChangeProposalWrapper(
|
|
146
|
+
entityUrn=job_urn,
|
|
147
|
+
aspect=SubTypesClass(
|
|
148
|
+
typeNames=[JobContainerSubTypes.STORED_PROCEDURE],
|
|
149
|
+
),
|
|
150
|
+
).as_workunit()
|
|
151
|
+
|
|
152
|
+
if database_key.instance:
|
|
153
|
+
yield MetadataChangeProposalWrapper(
|
|
154
|
+
entityUrn=job_urn,
|
|
155
|
+
aspect=DataPlatformInstanceClass(
|
|
156
|
+
platform=make_data_platform_urn(database_key.platform),
|
|
157
|
+
instance=make_dataplatform_instance_urn(
|
|
158
|
+
platform=database_key.platform,
|
|
159
|
+
instance=database_key.instance,
|
|
160
|
+
),
|
|
161
|
+
),
|
|
162
|
+
).as_workunit()
|
|
163
|
+
|
|
164
|
+
container_key = schema_key or database_key # database_key for 2-tier
|
|
165
|
+
yield MetadataChangeProposalWrapper(
|
|
166
|
+
entityUrn=job_urn,
|
|
167
|
+
aspect=ContainerClass(container=container_key.as_urn()),
|
|
168
|
+
).as_workunit()
|
|
169
|
+
|
|
170
|
+
# TODO: Config whether to ingest procedure code
|
|
171
|
+
if procedure.procedure_definition:
|
|
172
|
+
yield MetadataChangeProposalWrapper(
|
|
173
|
+
entityUrn=job_urn,
|
|
174
|
+
aspect=DataTransformLogicClass(
|
|
175
|
+
transforms=[
|
|
176
|
+
DataTransformClass(
|
|
177
|
+
queryStatement=QueryStatementClass(
|
|
178
|
+
value=procedure.procedure_definition,
|
|
179
|
+
language=procedure.language,
|
|
180
|
+
),
|
|
181
|
+
)
|
|
182
|
+
]
|
|
183
|
+
),
|
|
184
|
+
).as_workunit()
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def generate_procedure_lineage(
|
|
188
|
+
*,
|
|
189
|
+
schema_resolver: SchemaResolver,
|
|
190
|
+
procedure: BaseProcedure,
|
|
191
|
+
procedure_job_urn: str,
|
|
192
|
+
default_db: Optional[str] = None,
|
|
193
|
+
default_schema: Optional[str] = None,
|
|
194
|
+
is_temp_table: Callable[[str], bool] = lambda _: False,
|
|
195
|
+
raise_: bool = False,
|
|
196
|
+
) -> Iterable[MetadataChangeProposalWrapper]:
|
|
197
|
+
if procedure.procedure_definition and procedure.language == "SQL":
|
|
198
|
+
datajob_input_output = parse_procedure_code(
|
|
199
|
+
schema_resolver=schema_resolver,
|
|
200
|
+
default_db=default_db,
|
|
201
|
+
default_schema=default_schema,
|
|
202
|
+
code=procedure.procedure_definition,
|
|
203
|
+
is_temp_table=is_temp_table,
|
|
204
|
+
raise_=raise_,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if datajob_input_output:
|
|
208
|
+
yield MetadataChangeProposalWrapper(
|
|
209
|
+
entityUrn=procedure_job_urn,
|
|
210
|
+
aspect=datajob_input_output,
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def generate_procedure_container_workunits(
|
|
215
|
+
database_key: DatabaseKey,
|
|
216
|
+
schema_key: Optional[SchemaKey],
|
|
217
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
218
|
+
"""Generate container workunits for database and schema"""
|
|
219
|
+
|
|
220
|
+
yield from _generate_flow_workunits(database_key, schema_key)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def generate_procedure_workunits(
|
|
224
|
+
procedure: BaseProcedure,
|
|
225
|
+
database_key: DatabaseKey,
|
|
226
|
+
schema_key: Optional[SchemaKey],
|
|
227
|
+
schema_resolver: Optional[SchemaResolver],
|
|
228
|
+
) -> Iterable[MetadataWorkUnit]:
|
|
229
|
+
yield from _generate_job_workunits(database_key, schema_key, procedure)
|
|
230
|
+
|
|
231
|
+
if schema_resolver:
|
|
232
|
+
job_urn = procedure.to_urn(database_key, schema_key)
|
|
233
|
+
|
|
234
|
+
yield from auto_workunit(
|
|
235
|
+
generate_procedure_lineage(
|
|
236
|
+
schema_resolver=schema_resolver,
|
|
237
|
+
procedure=procedure,
|
|
238
|
+
procedure_job_urn=job_urn,
|
|
239
|
+
default_db=database_key.database,
|
|
240
|
+
default_schema=schema_key.db_schema if schema_key else None,
|
|
241
|
+
)
|
|
242
|
+
)
|
datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py}
RENAMED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Callable,
|
|
2
|
+
from typing import Callable, Optional
|
|
3
3
|
|
|
4
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
5
|
-
from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
|
|
6
4
|
from datahub.metadata.schema_classes import DataJobInputOutputClass
|
|
7
5
|
from datahub.sql_parsing.datajob import to_datajob_input_output
|
|
8
6
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -56,29 +54,3 @@ def parse_procedure_code(
|
|
|
56
54
|
mcps=mcps,
|
|
57
55
|
ignore_extra_mcps=True,
|
|
58
56
|
)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
# Is procedure handling generic enough to be added to SqlParsingAggregator?
|
|
62
|
-
def generate_procedure_lineage(
|
|
63
|
-
*,
|
|
64
|
-
schema_resolver: SchemaResolver,
|
|
65
|
-
procedure: StoredProcedure,
|
|
66
|
-
procedure_job_urn: str,
|
|
67
|
-
is_temp_table: Callable[[str], bool] = lambda _: False,
|
|
68
|
-
raise_: bool = False,
|
|
69
|
-
) -> Iterable[MetadataChangeProposalWrapper]:
|
|
70
|
-
if procedure.code:
|
|
71
|
-
datajob_input_output = parse_procedure_code(
|
|
72
|
-
schema_resolver=schema_resolver,
|
|
73
|
-
default_db=procedure.db,
|
|
74
|
-
default_schema=procedure.schema,
|
|
75
|
-
code=procedure.code,
|
|
76
|
-
is_temp_table=is_temp_table,
|
|
77
|
-
raise_=raise_,
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
if datajob_input_output:
|
|
81
|
-
yield MetadataChangeProposalWrapper(
|
|
82
|
-
entityUrn=procedure_job_urn,
|
|
83
|
-
aspect=datajob_input_output,
|
|
84
|
-
)
|
|
@@ -107,7 +107,6 @@ class ContainerKeyWithId(ContainerKey):
|
|
|
107
107
|
SourceCapability.DESCRIPTIONS,
|
|
108
108
|
"Extract descriptions for Vertex AI Registered Models and Model Versions",
|
|
109
109
|
)
|
|
110
|
-
@capability(SourceCapability.TAGS, "Extract tags for Vertex AI Registered Model Stages")
|
|
111
110
|
class VertexAISource(Source):
|
|
112
111
|
platform: str = "vertexai"
|
|
113
112
|
|
|
@@ -602,6 +601,7 @@ class VertexAISource(Source):
|
|
|
602
601
|
else None
|
|
603
602
|
),
|
|
604
603
|
customProperties=None,
|
|
604
|
+
externalUrl=self._make_model_external_url(model),
|
|
605
605
|
),
|
|
606
606
|
SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_MODEL_GROUP]),
|
|
607
607
|
ContainerClass(container=self._get_project_container().as_urn()),
|