acryl-datahub 1.0.0.1rc2__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic. Review the advisory details in the registry before upgrading.

Files changed (37) hide show
  1. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/METADATA +2569 -2569
  2. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD +37 -35
  3. datahub/_version.py +1 -1
  4. datahub/emitter/rest_emitter.py +2 -2
  5. datahub/ingestion/graph/client.py +6 -11
  6. datahub/ingestion/graph/filters.py +22 -2
  7. datahub/ingestion/source/common/subtypes.py +1 -1
  8. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  9. datahub/ingestion/source/ge_data_profiler.py +11 -1
  10. datahub/ingestion/source/mlflow.py +19 -1
  11. datahub/ingestion/source/redshift/lineage_v2.py +7 -0
  12. datahub/ingestion/source/redshift/query.py +1 -1
  13. datahub/ingestion/source/snowflake/constants.py +1 -0
  14. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  15. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  16. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  17. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  18. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  19. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  20. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  21. datahub/ingestion/source/sql/mssql/source.py +8 -4
  22. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  23. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  24. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  25. datahub/ingestion/source/superset.py +153 -13
  26. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  27. datahub/metadata/schema.avsc +2 -0
  28. datahub/metadata/schemas/Deprecation.avsc +2 -0
  29. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  30. datahub/sdk/__init__.py +1 -0
  31. datahub/sdk/main_client.py +2 -1
  32. datahub/sdk/search_filters.py +18 -23
  33. datahub/sql_parsing/split_statements.py +17 -3
  34. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/WHEEL +0 -0
  35. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/licenses/LICENSE +0 -0
  37. {acryl_datahub-1.0.0.1rc2.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/top_level.txt +0 -0
@@ -41,6 +41,7 @@ from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
41
41
  from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
42
42
  from datahub.ingestion.source.snowflake.snowflake_schema import (
43
43
  SCHEMA_PARALLELISM,
44
+ BaseProcedure,
44
45
  SnowflakeColumn,
45
46
  SnowflakeDatabase,
46
47
  SnowflakeDataDictionary,
@@ -63,12 +64,14 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
63
64
  from datahub.ingestion.source.sql.sql_utils import (
64
65
  add_table_to_schema_container,
65
66
  gen_database_container,
66
- gen_database_key,
67
67
  gen_schema_container,
68
- gen_schema_key,
69
68
  get_dataplatform_instance_aspect,
70
69
  get_domain_wu,
71
70
  )
71
+ from datahub.ingestion.source.sql.stored_procedures.base import (
72
+ generate_procedure_container_workunits,
73
+ generate_procedure_workunits,
74
+ )
72
75
  from datahub.ingestion.source_report.ingestion_stage import (
73
76
  EXTERNAL_TABLE_DDL_LINEAGE,
74
77
  LINEAGE_EXTRACTION,
@@ -448,10 +451,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
448
451
  if self.config.include_streams:
449
452
  self.report.num_get_streams_for_schema_queries += 1
450
453
  streams = self.fetch_streams_for_schema(
451
- snowflake_schema, db_name, schema_name
454
+ snowflake_schema,
455
+ db_name,
452
456
  )
453
457
  yield from self._process_streams(streams, snowflake_schema, db_name)
454
458
 
459
+ if self.config.include_procedures:
460
+ procedures = self.fetch_procedures_for_schema(snowflake_schema, db_name)
461
+ yield from self._process_procedures(procedures, snowflake_schema, db_name)
462
+
455
463
  if self.config.include_technical_schema and snowflake_schema.tags:
456
464
  yield from self._process_tags_in_schema(snowflake_schema)
457
465
 
@@ -536,6 +544,26 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
536
544
  for stream in streams:
537
545
  yield from self._process_stream(stream, snowflake_schema, db_name)
538
546
 
547
+ def _process_procedures(
548
+ self,
549
+ procedures: List[BaseProcedure],
550
+ snowflake_schema: SnowflakeSchema,
551
+ db_name: str,
552
+ ) -> Iterable[MetadataWorkUnit]:
553
+ if self.config.include_technical_schema:
554
+ if procedures:
555
+ yield from generate_procedure_container_workunits(
556
+ self.identifiers.gen_database_key(
557
+ db_name,
558
+ ),
559
+ self.identifiers.gen_schema_key(
560
+ db_name=db_name,
561
+ schema_name=snowflake_schema.name,
562
+ ),
563
+ )
564
+ for procedure in procedures:
565
+ yield from self._process_procedure(procedure, snowflake_schema, db_name)
566
+
539
567
  def _process_tags_in_schema(
540
568
  self, snowflake_schema: SnowflakeSchema
541
569
  ) -> Iterable[MetadataWorkUnit]:
@@ -819,13 +847,7 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
819
847
  entityUrn=dataset_urn, aspect=dataset_properties
820
848
  ).as_workunit()
821
849
 
822
- schema_container_key = gen_schema_key(
823
- db_name=self.snowflake_identifier(db_name),
824
- schema=self.snowflake_identifier(schema_name),
825
- platform=self.platform,
826
- platform_instance=self.config.platform_instance,
827
- env=self.config.env,
828
- )
850
+ schema_container_key = self.identifiers.gen_schema_key(db_name, schema_name)
829
851
 
830
852
  if self.config.extract_tags_as_structured_properties:
831
853
  yield from self.gen_column_tags_as_structured_properties(dataset_urn, table)
@@ -1094,11 +1116,8 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1094
1116
  def gen_database_containers(
1095
1117
  self, database: SnowflakeDatabase
1096
1118
  ) -> Iterable[MetadataWorkUnit]:
1097
- database_container_key = gen_database_key(
1098
- self.snowflake_identifier(database.name),
1099
- platform=self.platform,
1100
- platform_instance=self.config.platform_instance,
1101
- env=self.config.env,
1119
+ database_container_key = self.identifiers.gen_database_key(
1120
+ database.name,
1102
1121
  )
1103
1122
 
1104
1123
  yield from gen_database_container(
@@ -1147,21 +1166,9 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1147
1166
  def gen_schema_containers(
1148
1167
  self, schema: SnowflakeSchema, db_name: str
1149
1168
  ) -> Iterable[MetadataWorkUnit]:
1150
- schema_name = self.snowflake_identifier(schema.name)
1151
- database_container_key = gen_database_key(
1152
- database=self.snowflake_identifier(db_name),
1153
- platform=self.platform,
1154
- platform_instance=self.config.platform_instance,
1155
- env=self.config.env,
1156
- )
1169
+ database_container_key = self.identifiers.gen_database_key(db_name)
1157
1170
 
1158
- schema_container_key = gen_schema_key(
1159
- db_name=self.snowflake_identifier(db_name),
1160
- schema=schema_name,
1161
- platform=self.platform,
1162
- platform_instance=self.config.platform_instance,
1163
- env=self.config.env,
1164
- )
1171
+ schema_container_key = self.identifiers.gen_schema_key(db_name, schema.name)
1165
1172
 
1166
1173
  yield from gen_schema_container(
1167
1174
  name=schema.name,
@@ -1290,13 +1297,13 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1290
1297
  )
1291
1298
 
1292
1299
  def fetch_streams_for_schema(
1293
- self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str
1300
+ self, snowflake_schema: SnowflakeSchema, db_name: str
1294
1301
  ) -> List[SnowflakeStream]:
1295
1302
  try:
1296
1303
  streams: List[SnowflakeStream] = []
1297
- for stream in self.get_streams_for_schema(schema_name, db_name):
1304
+ for stream in self.get_streams_for_schema(snowflake_schema.name, db_name):
1298
1305
  stream_identifier = self.identifiers.get_dataset_identifier(
1299
- stream.name, schema_name, db_name
1306
+ stream.name, snowflake_schema.name, db_name
1300
1307
  )
1301
1308
 
1302
1309
  self.report.report_entity_scanned(stream_identifier, "stream")
@@ -1310,16 +1317,15 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1310
1317
  snowflake_schema.streams = [stream.name for stream in streams]
1311
1318
  return streams
1312
1319
  except Exception as e:
1313
- if isinstance(e, SnowflakePermissionError):
1314
- error_msg = f"Failed to get streams for schema {db_name}.{schema_name}. Please check permissions."
1315
- raise SnowflakePermissionError(error_msg) from e.__cause__
1316
- else:
1317
- self.structured_reporter.warning(
1318
- "Failed to get streams for schema",
1319
- f"{db_name}.{schema_name}",
1320
- exc=e,
1321
- )
1322
- return []
1320
+ self.structured_reporter.warning(
1321
+ title="Failed to get streams for schema",
1322
+ message="Please check permissions"
1323
+ if isinstance(e, SnowflakePermissionError)
1324
+ else "",
1325
+ context=f"{db_name}.{snowflake_schema.name}",
1326
+ exc=e,
1327
+ )
1328
+ return []
1323
1329
 
1324
1330
  def get_streams_for_schema(
1325
1331
  self, schema_name: str, db_name: str
@@ -1328,6 +1334,42 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1328
1334
 
1329
1335
  return streams.get(schema_name, [])
1330
1336
 
1337
+ def fetch_procedures_for_schema(
1338
+ self, snowflake_schema: SnowflakeSchema, db_name: str
1339
+ ) -> List[BaseProcedure]:
1340
+ try:
1341
+ procedures: List[BaseProcedure] = []
1342
+ for procedure in self.get_procedures_for_schema(snowflake_schema, db_name):
1343
+ procedure_qualified_name = self.identifiers.get_dataset_identifier(
1344
+ procedure.name, snowflake_schema.name, db_name
1345
+ )
1346
+ self.report.report_entity_scanned(procedure_qualified_name, "procedure")
1347
+
1348
+ if self.filters.is_procedure_allowed(procedure_qualified_name):
1349
+ procedures.append(procedure)
1350
+ else:
1351
+ self.report.report_dropped(procedure_qualified_name)
1352
+ return procedures
1353
+ except Exception as e:
1354
+ self.structured_reporter.warning(
1355
+ title="Failed to get procedures for schema",
1356
+ message="Please check permissions"
1357
+ if isinstance(e, SnowflakePermissionError)
1358
+ else "",
1359
+ context=f"{db_name}.{snowflake_schema.name}",
1360
+ exc=e,
1361
+ )
1362
+ return []
1363
+
1364
+ def get_procedures_for_schema(
1365
+ self,
1366
+ snowflake_schema: SnowflakeSchema,
1367
+ db_name: str,
1368
+ ) -> List[BaseProcedure]:
1369
+ procedures = self.data_dictionary.get_procedures_for_database(db_name)
1370
+
1371
+ return procedures.get(snowflake_schema.name, [])
1372
+
1331
1373
  def _process_stream(
1332
1374
  self,
1333
1375
  stream: SnowflakeStream,
@@ -1350,6 +1392,34 @@ class SnowflakeSchemaGenerator(SnowflakeStructuredReportMixin):
1350
1392
  "Failed to get columns for stream:", stream.name, exc=e
1351
1393
  )
1352
1394
 
1395
+ def _process_procedure(
1396
+ self,
1397
+ procedure: BaseProcedure,
1398
+ snowflake_schema: SnowflakeSchema,
1399
+ db_name: str,
1400
+ ) -> Iterable[MetadataWorkUnit]:
1401
+ try:
1402
+ # TODO: For CLL, we should process procedures after all tables are processed
1403
+ yield from generate_procedure_workunits(
1404
+ procedure,
1405
+ database_key=self.identifiers.gen_database_key(
1406
+ db_name,
1407
+ ),
1408
+ schema_key=self.identifiers.gen_schema_key(
1409
+ db_name, snowflake_schema.name
1410
+ ),
1411
+ schema_resolver=(
1412
+ self.aggregator._schema_resolver if self.aggregator else None
1413
+ ),
1414
+ )
1415
+ except Exception as e:
1416
+ self.structured_reporter.warning(
1417
+ title="Failed to ingest stored procedure",
1418
+ message="",
1419
+ context=procedure.name,
1420
+ exc=e,
1421
+ )
1422
+
1353
1423
  def get_columns_for_stream(
1354
1424
  self,
1355
1425
  source_object: str, # Qualified name of source table/view
@@ -3,7 +3,10 @@ from functools import cached_property
3
3
  from typing import ClassVar, List, Literal, Optional, Tuple
4
4
 
5
5
  from datahub.configuration.pattern_utils import is_schema_allowed
6
- from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
6
+ from datahub.emitter.mce_builder import (
7
+ make_dataset_urn_with_platform_instance,
8
+ )
9
+ from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
7
10
  from datahub.ingestion.api.source import SourceReport
8
11
  from datahub.ingestion.source.snowflake.constants import (
9
12
  SNOWFLAKE_REGION_CLOUD_REGION_MAPPING,
@@ -16,6 +19,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
16
19
  SnowflakeV2Config,
17
20
  )
18
21
  from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
22
+ from datahub.ingestion.source.sql.sql_utils import gen_database_key, gen_schema_key
19
23
 
20
24
 
21
25
  class SnowflakeStructuredReportMixin(abc.ABC):
@@ -180,6 +184,9 @@ class SnowflakeFilter:
180
184
 
181
185
  return True
182
186
 
187
    def is_procedure_allowed(self, procedure_name: str) -> bool:
        """Return True when the qualified procedure name matches the configured
        `procedure_pattern` allow/deny rules."""
        return self.filter_config.procedure_pattern.allowed(procedure_name)
189
+
183
190
 
184
191
  def _combine_identifier_parts(
185
192
  *, table_name: str, schema_name: str, db_name: str
@@ -330,6 +337,23 @@ class SnowflakeIdentifierBuilder:
330
337
  else user_name
331
338
  )
332
339
 
340
    def gen_schema_key(self, db_name: str, schema_name: str) -> SchemaKey:
        """Build a SchemaKey from raw names, applying snowflake identifier
        normalization and this source's platform/instance/env settings."""
        return gen_schema_key(
            db_name=self.snowflake_identifier(db_name),
            schema=self.snowflake_identifier(schema_name),
            platform=self.platform,
            platform_instance=self.identifier_config.platform_instance,
            env=self.identifier_config.env,
        )
348
+
349
    def gen_database_key(self, db_name: str) -> DatabaseKey:
        """Build a DatabaseKey from a raw name, applying snowflake identifier
        normalization and this source's platform/instance/env settings."""
        return gen_database_key(
            database=self.snowflake_identifier(db_name),
            platform=self.platform,
            platform_instance=self.identifier_config.platform_instance,
            env=self.identifier_config.env,
        )
356
+
333
357
 
334
358
  class SnowflakeCommonMixin(SnowflakeStructuredReportMixin):
335
359
  platform = "snowflake"
@@ -15,6 +15,7 @@ from datahub.ingestion.source.common.subtypes import (
15
15
  FlowContainerSubTypes,
16
16
  JobContainerSubTypes,
17
17
  )
18
+ from datahub.ingestion.source.sql.stored_procedures.base import BaseProcedure
18
19
  from datahub.metadata.schema_classes import (
19
20
  ContainerClass,
20
21
  DataFlowInfoClass,
@@ -135,6 +136,19 @@ class StoredProcedure:
135
136
  def escape_full_name(self) -> str:
136
137
  return f"[{self.db}].[{self.schema}].[{self.formatted_name}]"
137
138
 
139
    def to_base_procedure(self) -> BaseProcedure:
        """Adapt this MSSQL stored procedure to the platform-agnostic
        BaseProcedure.

        Metadata MSSQL's model does not carry here (timestamps, comment,
        signature, return type) is left as None; the language is fixed to
        "SQL" so downstream lineage parsing is attempted.
        """
        return BaseProcedure(
            name=self.formatted_name,
            procedure_definition=self.code,
            created=None,
            last_altered=None,
            comment=None,
            argument_signature=None,
            return_type=None,
            language="SQL",
            extra_properties=None,
        )
151
+
138
152
 
139
153
  @dataclass
140
154
  class JobStep:
@@ -222,7 +236,7 @@ class MSSQLDataJob:
222
236
  type = (
223
237
  JobContainerSubTypes.MSSQL_JOBSTEP
224
238
  if isinstance(self.entity, JobStep)
225
- else JobContainerSubTypes.MSSQL_STORED_PROCEDURE
239
+ else JobContainerSubTypes.STORED_PROCEDURE
226
240
  )
227
241
  return SubTypesClass(
228
242
  typeNames=[type],
@@ -37,9 +37,6 @@ from datahub.ingestion.source.sql.mssql.job_models import (
37
37
  ProcedureParameter,
38
38
  StoredProcedure,
39
39
  )
40
- from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
41
- generate_procedure_lineage,
42
- )
43
40
  from datahub.ingestion.source.sql.sql_common import (
44
41
  SQLAlchemySource,
45
42
  SqlWorkUnit,
@@ -50,6 +47,9 @@ from datahub.ingestion.source.sql.sql_config import (
50
47
  make_sqlalchemy_uri,
51
48
  )
52
49
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
50
+ from datahub.ingestion.source.sql.stored_procedures.base import (
51
+ generate_procedure_lineage,
52
+ )
53
53
  from datahub.utilities.file_backed_collections import FileBackedList
54
54
 
55
55
  logger: logging.Logger = logging.getLogger(__name__)
@@ -65,6 +65,8 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
65
65
  # defaults
66
66
  host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
67
67
  scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
68
+
69
+ # TODO: rename to include_procedures ?
68
70
  include_stored_procedures: bool = Field(
69
71
  default=True,
70
72
  description="Include ingest of stored procedures. Requires access to the 'sys' schema.",
@@ -763,9 +765,11 @@ class SQLServerSource(SQLAlchemySource):
763
765
  yield from auto_workunit(
764
766
  generate_procedure_lineage(
765
767
  schema_resolver=self.get_schema_resolver(),
766
- procedure=procedure,
768
+ procedure=procedure.to_base_procedure(),
767
769
  procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
768
770
  is_temp_table=self.is_temp_table,
771
+ default_db=procedure.db,
772
+ default_schema=procedure.schema,
769
773
  )
770
774
  )
771
775
 
@@ -0,0 +1,242 @@
1
+ from dataclasses import dataclass
2
+ from datetime import datetime
3
+ from typing import Callable, Dict, Iterable, Optional
4
+
5
+ from datahub.emitter.mce_builder import (
6
+ DEFAULT_ENV,
7
+ datahub_guid,
8
+ make_data_flow_urn,
9
+ make_data_job_urn,
10
+ make_data_platform_urn,
11
+ make_dataplatform_instance_urn,
12
+ )
13
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
14
+ from datahub.emitter.mcp_builder import DatabaseKey, SchemaKey
15
+ from datahub.ingestion.api.source_helpers import auto_workunit
16
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
17
+ from datahub.ingestion.source.common.subtypes import (
18
+ FlowContainerSubTypes,
19
+ JobContainerSubTypes,
20
+ )
21
+ from datahub.ingestion.source.sql.stored_procedures.lineage import parse_procedure_code
22
+ from datahub.metadata.schema_classes import (
23
+ ContainerClass,
24
+ DataFlowInfoClass,
25
+ DataJobInfoClass,
26
+ DataPlatformInstanceClass,
27
+ DataTransformClass,
28
+ DataTransformLogicClass,
29
+ QueryStatementClass,
30
+ SubTypesClass,
31
+ )
32
+ from datahub.sql_parsing.schema_resolver import SchemaResolver
33
+
34
+
35
@dataclass
class BaseProcedure:
    """Platform-agnostic description of a stored procedure.

    Shared by SQL sources (MSSQL, Snowflake, ...) so that DataJob emission
    and lineage parsing can be implemented once.
    """

    name: str
    procedure_definition: Optional[str]  # the procedure's source code, if known
    created: Optional[datetime]
    last_altered: Optional[datetime]
    comment: Optional[str]
    argument_signature: Optional[str]
    return_type: Optional[str]
    language: str  # lineage is only parsed when this is "SQL"
    extra_properties: Optional[Dict[str, str]]

    def get_procedure_identifier(self) -> str:
        """Return a job id for this procedure.

        When an argument signature is present, a guid of the signature is
        appended so overloaded procedures get distinct job ids.
        """
        if not self.argument_signature:
            return self.name
        signature_hash = datahub_guid(
            dict(argument_signature=self.argument_signature)
        )
        return f"{self.name}_{signature_hash}"

    def to_urn(self, database_key: DatabaseKey, schema_key: Optional[SchemaKey]) -> str:
        """Build the DataJob urn for this procedure under the given container keys."""
        return make_data_job_urn(
            orchestrator=database_key.platform,
            flow_id=_get_procedure_flow_name(database_key, schema_key),
            job_id=self.get_procedure_identifier(),
            cluster=database_key.env or DEFAULT_ENV,
            platform_instance=database_key.instance,
        )
66
+
67
+
68
def _generate_flow_workunits(
    database_key: DatabaseKey, schema_key: Optional[SchemaKey]
) -> Iterable[MetadataWorkUnit]:
    """Emit the DataFlow aspects for the container grouping a schema's
    (or, for 2-tier platforms, a database's) stored procedures."""

    flow_name = _get_procedure_flow_name(database_key, schema_key)
    flow_urn = make_data_flow_urn(
        orchestrator=database_key.platform,
        flow_id=flow_name,
        cluster=database_key.env or DEFAULT_ENV,
        platform_instance=database_key.instance,
    )

    aspects = [
        DataFlowInfoClass(
            name=flow_name,
        ),
        # NOTE(review): this helper is shared across platforms but still emits
        # the MSSQL-branded container subtype -- confirm whether a generic
        # subtype is intended here.
        SubTypesClass(
            typeNames=[FlowContainerSubTypes.MSSQL_PROCEDURE_CONTAINER],
        ),
    ]

    if database_key.instance:
        aspects.append(
            DataPlatformInstanceClass(
                platform=make_data_platform_urn(database_key.platform),
                instance=make_dataplatform_instance_urn(
                    platform=database_key.platform,
                    instance=database_key.instance,
                ),
            )
        )

    # The flow is always parented to the database container; schema_key only
    # influences the flow's name.
    aspects.append(ContainerClass(container=database_key.as_urn()))

    for aspect in aspects:
        yield MetadataChangeProposalWrapper(
            entityUrn=flow_urn,
            aspect=aspect,
        ).as_workunit()
113
+
114
def _get_procedure_flow_name(
    database_key: DatabaseKey, schema_key: Optional[SchemaKey]
) -> str:
    """Derive the DataFlow id for a procedure container.

    Returns "<db>.<schema>.stored_procedures" when a schema key is available,
    otherwise "<db>.stored_procedures" (2-tier platforms).
    """
    if schema_key is None:
        return f"{database_key.database}.stored_procedures"
    return f"{schema_key.database}.{schema_key.db_schema}.stored_procedures"
124
+
125
+
126
def _generate_job_workunits(
    database_key: DatabaseKey,
    schema_key: Optional[SchemaKey],
    procedure: BaseProcedure,
) -> Iterable[MetadataWorkUnit]:
    """Emit the DataJob aspects for one stored procedure: info, subtype,
    optional platform instance, container parent and (when the definition is
    available) the procedure code as transform logic."""

    job_urn = procedure.to_urn(database_key, schema_key)

    def _wu(aspect) -> MetadataWorkUnit:
        # All aspects below attach to the same DataJob urn.
        return MetadataChangeProposalWrapper(
            entityUrn=job_urn,
            aspect=aspect,
        ).as_workunit()

    yield _wu(
        DataJobInfoClass(
            name=procedure.name,
            type=JobContainerSubTypes.STORED_PROCEDURE,
            description=procedure.comment,
            customProperties=procedure.extra_properties,
        )
    )

    yield _wu(
        SubTypesClass(
            typeNames=[JobContainerSubTypes.STORED_PROCEDURE],
        )
    )

    if database_key.instance:
        yield _wu(
            DataPlatformInstanceClass(
                platform=make_data_platform_urn(database_key.platform),
                instance=make_dataplatform_instance_urn(
                    platform=database_key.platform,
                    instance=database_key.instance,
                ),
            )
        )

    # Parent under the schema container when one exists; database_key is the
    # fallback for 2-tier platforms.
    parent_key = schema_key or database_key
    yield _wu(ContainerClass(container=parent_key.as_urn()))

    # TODO: Config whether to ingest procedure code
    if procedure.procedure_definition:
        yield _wu(
            DataTransformLogicClass(
                transforms=[
                    DataTransformClass(
                        queryStatement=QueryStatementClass(
                            value=procedure.procedure_definition,
                            language=procedure.language,
                        ),
                    )
                ]
            )
        )
185
+
186
+
187
def generate_procedure_lineage(
    *,
    schema_resolver: SchemaResolver,
    procedure: BaseProcedure,
    procedure_job_urn: str,
    default_db: Optional[str] = None,
    default_schema: Optional[str] = None,
    is_temp_table: Callable[[str], bool] = lambda _: False,
    raise_: bool = False,
) -> Iterable[MetadataChangeProposalWrapper]:
    """SQL-parse the procedure body and yield its DataJobInputOutput MCP.

    Yields nothing when the procedure has no definition, is not SQL, or the
    parser derived no lineage. With raise_=True, parse failures propagate
    (behavior delegated to parse_procedure_code).
    """
    if not procedure.procedure_definition or procedure.language != "SQL":
        return

    datajob_input_output = parse_procedure_code(
        schema_resolver=schema_resolver,
        default_db=default_db,
        default_schema=default_schema,
        code=procedure.procedure_definition,
        is_temp_table=is_temp_table,
        raise_=raise_,
    )

    if datajob_input_output:
        yield MetadataChangeProposalWrapper(
            entityUrn=procedure_job_urn,
            aspect=datajob_input_output,
        )
212
+
213
+
214
def generate_procedure_container_workunits(
    database_key: DatabaseKey,
    schema_key: Optional[SchemaKey],
) -> Iterable[MetadataWorkUnit]:
    """Generate the DataFlow container workunits that will hold the
    database/schema's procedure DataJobs (delegates to
    _generate_flow_workunits)."""

    yield from _generate_flow_workunits(database_key, schema_key)
221
+
222
+
223
def generate_procedure_workunits(
    procedure: BaseProcedure,
    database_key: DatabaseKey,
    schema_key: Optional[SchemaKey],
    schema_resolver: Optional[SchemaResolver],
) -> Iterable[MetadataWorkUnit]:
    """Emit all workunits for one procedure: the DataJob aspects plus, when a
    schema resolver is supplied, SQL-parsed lineage for its definition."""
    yield from _generate_job_workunits(database_key, schema_key, procedure)

    if schema_resolver is None:
        return

    lineage = generate_procedure_lineage(
        schema_resolver=schema_resolver,
        procedure=procedure,
        procedure_job_urn=procedure.to_urn(database_key, schema_key),
        default_db=database_key.database,
        default_schema=schema_key.db_schema if schema_key else None,
    )
    yield from auto_workunit(lineage)
@@ -1,8 +1,6 @@
1
1
  import logging
2
- from typing import Callable, Iterable, Optional
2
+ from typing import Callable, Optional
3
3
 
4
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
5
- from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
6
4
  from datahub.metadata.schema_classes import DataJobInputOutputClass
7
5
  from datahub.sql_parsing.datajob import to_datajob_input_output
8
6
  from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -56,29 +54,3 @@ def parse_procedure_code(
56
54
  mcps=mcps,
57
55
  ignore_extra_mcps=True,
58
56
  )
59
-
60
-
61
- # Is procedure handling generic enough to be added to SqlParsingAggregator?
62
- def generate_procedure_lineage(
63
- *,
64
- schema_resolver: SchemaResolver,
65
- procedure: StoredProcedure,
66
- procedure_job_urn: str,
67
- is_temp_table: Callable[[str], bool] = lambda _: False,
68
- raise_: bool = False,
69
- ) -> Iterable[MetadataChangeProposalWrapper]:
70
- if procedure.code:
71
- datajob_input_output = parse_procedure_code(
72
- schema_resolver=schema_resolver,
73
- default_db=procedure.db,
74
- default_schema=procedure.schema,
75
- code=procedure.code,
76
- is_temp_table=is_temp_table,
77
- raise_=raise_,
78
- )
79
-
80
- if datajob_input_output:
81
- yield MetadataChangeProposalWrapper(
82
- entityUrn=procedure_job_urn,
83
- aspect=datajob_input_output,
84
- )