acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=WMIoTkprsOt2SjEA2gbvdoJ3VkTIeWCVPilQAqNdCQw,577
+ datahub/__init__.py,sha256=sacWO6qm2tPLBhc25GFoFnt_AeiT0Qk9ZiibpHJbPhQ,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -449,7 +449,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
  datahub/ingestion/source/sql/oracle.py,sha256=exJJvMSC42Oj0IiFtpAPFatWkRu9mVu8sd7ofOdanqU,22460
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
- datahub/ingestion/source/sql/sql_common.py,sha256=SZkPK7aUiTdMHf5alG5724i1pyAn1jFUjcuDkKUoeL0,51813
+ datahub/ingestion/source/sql/sql_common.py,sha256=Qe481Pct1vfl-NVpgOufSEaNSMgJGD3bq1DPvjIkPTg,52164
  datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
  datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=vWna43rv9ClfK9MBNTx8tdoUr35nD5d8ppgeamEEhSQ,12528
@@ -462,8 +462,9 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
  datahub/ingestion/source/sql/vertica.py,sha256=pkx-1JDBmow7WhoEIEh0SLj7kff9L0zUOHDytoC43gk,33339
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
- datahub/ingestion/source/sql/mssql/job_models.py,sha256=-wDCltoBAFHaydAV4Gs0vm-j7qKNCH5iEJOVWanhlZw,5981
- datahub/ingestion/source/sql/mssql/source.py,sha256=_qenSWn5mPrr5EPDB6Qbt9EOURqesK68Fm_pnh9sO58,26594
+ datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
+ datahub/ingestion/source/sql/mssql/source.py,sha256=fzpWjwexGvJgpd6Z4DCsK6Ld2vQCfPkD2M1xE4pU9Ec,29542
+ datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
  datahub/ingestion/source/state/entity_removal_state.py,sha256=zvIsmYg7oiIu2FhecU0VfLBNToUqvKoKyDeiFfkOcyc,6611
@@ -858,9 +859,11 @@ datahub/specific/structured_property.py,sha256=IYeFyafPidNrDbn1sU65rEPwIZDS-wLY1
  datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
  datahub/sql_parsing/_sqlglot_patch.py,sha256=iYJ8zOThHqqbamD5jdNr9iHTWD7ewNeHzPiTb6-rO3Y,7043
- datahub/sql_parsing/query_types.py,sha256=AxoWZiLuXM6iZ2AXdSlD82gqPNr1ZFh78wn2MSTu2wY,2479
+ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+ datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=9wbJT80K4nsIHHOuQLso9QoFQAYwfSJZnqHwsaU3UTY,10197
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=YzYmWNQjXuQYcfh91ZnqjuzCDWdmZpE-ZX14Kz6XFMs,69938
+ datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=gLelf5l73EufB8qijb9ZDLANkt4o05schGg4DY-bOJs,69937
  datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=K532_zvj7vEt2Ci2d3kxYZLZkq4S0ECBRZ2nHuY11a4,45927
@@ -967,8 +970,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.14.1.13rc4.dist-info/METADATA,sha256=pKvm7N_prnqdEylMVyCtB6aReDQm2T-0LmvkhiwUdrU,171138
- acryl_datahub-0.14.1.13rc4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.14.1.13rc4.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
- acryl_datahub-0.14.1.13rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.14.1.13rc4.dist-info/RECORD,,
+ acryl_datahub-0.14.1.13rc5.dist-info/METADATA,sha256=9ZZBowQ_PYKf6d4V6EHqTTlJDY77BqxSX8uKfiIqFT4,171138
+ acryl_datahub-0.14.1.13rc5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.14.1.13rc5.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+ acryl_datahub-0.14.1.13rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.14.1.13rc5.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.14.1.13rc4"
+ __version__ = "0.14.1.13rc5"


  def is_dev_mode() -> bool:
datahub/ingestion/source/sql/mssql/job_models.py CHANGED
@@ -101,6 +101,7 @@ class StoredProcedure:
  flow: Union[MSSQLJob, MSSQLProceduresContainer]
  type: str = "STORED_PROCEDURE"
  source: str = "mssql"
+ code: Optional[str] = None

  @property
  def full_type(self) -> str:
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
+ from datahub.ingestion.api.source import StructuredLogLevel
+ from datahub.ingestion.api.source_helpers import auto_workunit
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.sql.mssql.job_models import (
  JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
  ProcedureParameter,
  StoredProcedure,
  )
+ from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+ generate_procedure_lineage,
+ )
  from datahub.ingestion.source.sql.sql_common import (
  SQLAlchemySource,
  SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
  StringTypeClass,
  UnionTypeClass,
  )
+ from datahub.utilities.file_backed_collections import FileBackedList

  logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
  default=False,
  description="Enable to convert the SQL Server assets urns to lowercase",
  )
+ include_lineage: bool = Field(
+ default=True,
+ description="Enable lineage extraction for stored procedures",
+ )

  @pydantic.validator("uri_args")
  def passwords_match(cls, v, values, **kwargs):
@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
  self.current_database = None
  self.table_descriptions: Dict[str, str] = {}
  self.column_descriptions: Dict[str, str] = {}
+ self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
  if self.config.include_descriptions:
  for inspector in self.get_inspectors():
  db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
  def loop_job_steps(
  self, job: MSSQLJob, job_steps: Dict[str, Any]
  ) -> Iterable[MetadataWorkUnit]:
- for step_id, step_data in job_steps.items():
+ for _step_id, step_data in job_steps.items():
  step = JobStep(
  job_name=job.formatted_name,
  step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
  if procedures:
  yield from self.construct_flow_workunits(data_flow=data_flow)
  for procedure in procedures:
- upstream = self._get_procedure_upstream(conn, procedure)
- downstream = self._get_procedure_downstream(conn, procedure)
- data_job = MSSQLDataJob(
- entity=procedure,
- )
- # TODO: because of this upstream and downstream are more dependencies,
- # can't be used as DataJobInputOutput.
- # Should be reorganized into lineage.
- data_job.add_property("procedure_depends_on", str(upstream.as_property))
- data_job.add_property(
- "depending_on_procedure", str(downstream.as_property)
- )
- procedure_definition, procedure_code = self._get_procedure_code(
- conn, procedure
- )
- if procedure_definition:
- data_job.add_property("definition", procedure_definition)
- if sql_config.include_stored_procedures_code and procedure_code:
- data_job.add_property("code", procedure_code)
- procedure_inputs = self._get_procedure_inputs(conn, procedure)
- properties = self._get_procedure_properties(conn, procedure)
- data_job.add_property(
- "input parameters", str([param.name for param in procedure_inputs])
- )
- for param in procedure_inputs:
- data_job.add_property(
- f"parameter {param.name}", str(param.properties)
- )
- for property_name, property_value in properties.items():
- data_job.add_property(property_name, str(property_value))
- yield from self.construct_job_workunits(data_job)
+ yield from self._process_stored_procedure(conn, procedure)
+
+ def _process_stored_procedure(
+ self, conn: Connection, procedure: StoredProcedure
+ ) -> Iterable[MetadataWorkUnit]:
+ upstream = self._get_procedure_upstream(conn, procedure)
+ downstream = self._get_procedure_downstream(conn, procedure)
+ data_job = MSSQLDataJob(
+ entity=procedure,
+ )
+ # TODO: because of this upstream and downstream are more dependencies,
+ # can't be used as DataJobInputOutput.
+ # Should be reorganized into lineage.
+ data_job.add_property("procedure_depends_on", str(upstream.as_property))
+ data_job.add_property("depending_on_procedure", str(downstream.as_property))
+ procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+ procedure.code = procedure_code
+ if procedure_definition:
+ data_job.add_property("definition", procedure_definition)
+ if procedure_code and self.config.include_stored_procedures_code:
+ data_job.add_property("code", procedure_code)
+ procedure_inputs = self._get_procedure_inputs(conn, procedure)
+ properties = self._get_procedure_properties(conn, procedure)
+ data_job.add_property(
+ "input parameters", str([param.name for param in procedure_inputs])
+ )
+ for param in procedure_inputs:
+ data_job.add_property(f"parameter {param.name}", str(param.properties))
+ for property_name, property_value in properties.items():
+ data_job.add_property(property_name, str(property_value))
+ if self.config.include_lineage:
+ # These will be used to construct lineage
+ self.stored_procedures.append(procedure)
+ yield from self.construct_job_workunits(
+ data_job,
+ # For stored procedure lineage is ingested later
+ include_lineage=False,
+ )

  @staticmethod
  def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
  code_list.append(row["Text"])
  if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
  code_slice_index = index
- definition = "\n".join(code_list[:code_slice_index])
- code = "\n".join(code_list[code_slice_index:])
+ definition = "".join(code_list[:code_slice_index])
+ code = "".join(code_list[code_slice_index:])
  except ResourceClosedError:
  logger.warning(
  "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
  def construct_job_workunits(
  self,
  data_job: MSSQLDataJob,
+ include_lineage: bool = True,
  ) -> Iterable[MetadataWorkUnit]:
  yield MetadataChangeProposalWrapper(
  entityUrn=data_job.urn,
  aspect=data_job.as_datajob_info_aspect,
  ).as_workunit()

- yield MetadataChangeProposalWrapper(
- entityUrn=data_job.urn,
- aspect=data_job.as_datajob_input_output_aspect,
- ).as_workunit()
+ if include_lineage:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=data_job.urn,
+ aspect=data_job.as_datajob_input_output_aspect,
+ ).as_workunit()
  # TODO: Add SubType when it appear

  def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
  if self.config.convert_urns_to_lowercase
  else qualified_table_name
  )
+
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ yield from super().get_workunits_internal()
+
+ # This is done at the end so that we will have access to tables
+ # from all databases in schema_resolver and discovered_tables
+ for procedure in self.stored_procedures:
+ with self.report.report_exc(
+ message="Failed to parse stored procedure lineage",
+ context=procedure.full_name,
+ level=StructuredLogLevel.WARN,
+ ):
+ yield from auto_workunit(
+ generate_procedure_lineage(
+ schema_resolver=self.schema_resolver,
+ procedure=procedure,
+ procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+ is_temp_table=self.is_temp_table,
+ )
+ )
+
+ def is_temp_table(self, name: str) -> bool:
+ try:
+ parts = name.split(".")
+ table_name = parts[-1]
+ schema_name = parts[-2]
+ db_name = parts[-3]
+
+ if table_name.startswith("#"):
+ return True
+
+ # This is also a temp table if
+ # 1. this name would be allowed by the dataset patterns, and
+ # 2. we have a list of discovered tables, and
+ # 3. it's not in the discovered tables list
+ if (
+ self.config.database_pattern.allowed(db_name)
+ and self.config.schema_pattern.allowed(schema_name)
+ and self.config.table_pattern.allowed(name)
+ and self.standardize_identifier_case(name)
+ not in self.discovered_datasets
+ ):
+ logger.debug(f"inferred as temp table {name}")
+ return True
+
+ except Exception:
+ logger.warning(f"Error parsing table name {name} ")
+ return False
+
+ def standardize_identifier_case(self, table_ref_str: str) -> str:
+ return (
+ table_ref_str.lower()
+ if self.config.convert_urns_to_lowercase
+ else table_ref_str
+ )
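
Note: the new SQLServerConfig.include_lineage flag (default True) is what gates the deferred stored-procedure lineage emission above. A minimal recipe sketch of how the flag might be set, written as a Python pipeline config; the connection values and the console sink are placeholders, not taken from this diff:

from datahub.ingestion.run.pipeline import Pipeline

# Placeholder connection details; only include_lineage comes from this release.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "localhost:1433",
                "database": "MyDB",
                "username": "datahub",
                "password": "example-password",
                # New in this version: parse stored procedure code into lineage.
                "include_lineage": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
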
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py ADDED
@@ -0,0 +1,84 @@
+ import logging
+ from typing import Callable, Iterable, Optional
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+ from datahub.metadata.schema_classes import DataJobInputOutputClass
+ from datahub.sql_parsing.datajob import to_datajob_input_output
+ from datahub.sql_parsing.schema_resolver import SchemaResolver
+ from datahub.sql_parsing.split_statements import split_statements
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+ ObservedQuery,
+ SqlParsingAggregator,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_procedure_code(
+ *,
+ schema_resolver: SchemaResolver,
+ default_db: Optional[str],
+ default_schema: Optional[str],
+ code: str,
+ is_temp_table: Callable[[str], bool],
+ raise_: bool = False,
+ ) -> Optional[DataJobInputOutputClass]:
+ aggregator = SqlParsingAggregator(
+ platform=schema_resolver.platform,
+ env=schema_resolver.env,
+ schema_resolver=schema_resolver,
+ generate_lineage=True,
+ generate_queries=False,
+ generate_usage_statistics=False,
+ generate_operations=False,
+ generate_query_subject_fields=False,
+ generate_query_usage_statistics=False,
+ is_temp_table=is_temp_table,
+ )
+ for query in split_statements(code):
+ # TODO: We should take into account `USE x` statements.
+ aggregator.add_observed_query(
+ observed=ObservedQuery(
+ default_db=default_db,
+ default_schema=default_schema,
+ query=query,
+ )
+ )
+ if aggregator.report.num_observed_queries_failed and raise_:
+ logger.info(aggregator.report.as_string())
+ raise ValueError(
+ f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+ )
+
+ mcps = list(aggregator.gen_metadata())
+ return to_datajob_input_output(
+ mcps=mcps,
+ ignore_extra_mcps=True,
+ )
+
+
+ # Is procedure handling generic enough to be added to SqlParsingAggregator?
+ def generate_procedure_lineage(
+ *,
+ schema_resolver: SchemaResolver,
+ procedure: StoredProcedure,
+ procedure_job_urn: str,
+ is_temp_table: Callable[[str], bool] = lambda _: False,
+ raise_: bool = False,
+ ) -> Iterable[MetadataChangeProposalWrapper]:
+ if procedure.code:
+ datajob_input_output = parse_procedure_code(
+ schema_resolver=schema_resolver,
+ default_db=procedure.db,
+ default_schema=procedure.schema,
+ code=procedure.code,
+ is_temp_table=is_temp_table,
+ raise_=raise_,
+ )
+
+ if datajob_input_output:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=procedure_job_urn,
+ aspect=datajob_input_output,
+ )
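
Note: a rough usage sketch for the new parse_procedure_code helper, independent of the MSSQL source. The procedure body and the offline SchemaResolver construction below are illustrative assumptions, not taken from this diff:

from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
    parse_procedure_code,
)
from datahub.sql_parsing.schema_resolver import SchemaResolver

# Hypothetical T-SQL body for a stored procedure.
PROC_CODE = """
CREATE TABLE #staging (id INT);
INSERT INTO #staging SELECT id FROM dbo.source_table;
INSERT INTO dbo.target_table SELECT id FROM #staging;
"""

# Assumption: a resolver built without a DataHub graph still lets the
# aggregator infer table-level lineage from the parsed statements.
schema_resolver = SchemaResolver(platform="mssql", env="PROD")

datajob_io = parse_procedure_code(
    schema_resolver=schema_resolver,
    default_db="MyDB",
    default_schema="dbo",
    code=PROC_CODE,
    is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
)
if datajob_io:
    print(datajob_io.inputDatasets, datajob_io.outputDatasets)
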
datahub/ingestion/source/sql/sql_common.py CHANGED
@@ -392,6 +392,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
  platform_instance=self.config.platform_instance,
  env=self.config.env,
  )
+ self.discovered_datasets: Set[str] = set()
  self._view_definition_cache: MutableMapping[str, str]
  if self.config.use_file_backed_cache:
  self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +832,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
  self._classify(dataset_name, schema, table, data_reader, schema_metadata)

  dataset_snapshot.aspects.append(schema_metadata)
- if self.config.include_view_lineage:
+ if self._save_schema_to_resolver():
  self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+ self.discovered_datasets.add(dataset_name)
  db_name = self.get_db_name(inspector)

  yield from self.add_table_to_schema_container(
@@ -1126,8 +1128,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
  columns,
  canonical_schema=schema_fields,
  )
- if self.config.include_view_lineage:
+ if self._save_schema_to_resolver():
  self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+ self.discovered_datasets.add(dataset_name)
  description, properties, _ = self.get_table_properties(inspector, schema, view)
  try:
  view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1193,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
  domain_registry=self.domain_registry,
  )

+ def _save_schema_to_resolver(self):
+ return self.config.include_view_lineage or (
+ hasattr(self.config, "include_lineage") and self.config.include_lineage
+ )
+
  def _run_sql_parser(
  self, view_identifier: str, query: str, schema_resolver: SchemaResolver
  ) -> Optional[SqlParsingResult]:
datahub/sql_parsing/datajob.py ADDED
@@ -0,0 +1,50 @@
+ import logging
+ from typing import Iterable, List, Optional
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.metadata.schema_classes import (
+ DataJobInputOutputClass,
+ FineGrainedLineageClass,
+ UpstreamLineageClass,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def to_datajob_input_output(
+ *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+ ) -> Optional[DataJobInputOutputClass]:
+ inputDatasets: List[str] = []
+ outputDatasets: List[str] = []
+ fineGrainedLineages: List[FineGrainedLineageClass] = []
+ for mcp in mcps:
+ # TODO: Represent simple write operations without lineage as outputDatasets.
+
+ upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+ if upstream_lineage is not None:
+ if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+ outputDatasets.append(mcp.entityUrn)
+
+ for upstream in upstream_lineage.upstreams:
+ if upstream.dataset not in inputDatasets:
+ inputDatasets.append(upstream.dataset)
+
+ if upstream_lineage.fineGrainedLineages:
+ for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+ fineGrainedLineages.append(fineGrainedLineage)
+
+ elif ignore_extra_mcps:
+ pass
+ else:
+ raise ValueError(
+ f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+ )
+
+ if not inputDatasets and not outputDatasets:
+ return None
+
+ return DataJobInputOutputClass(
+ inputDatasets=inputDatasets,
+ outputDatasets=outputDatasets,
+ fineGrainedLineages=fineGrainedLineages,
+ )
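
Note: to make the folding behavior of to_datajob_input_output concrete: each MCP carrying an upstreamLineage aspect contributes its entity URN as an output and its upstreams as inputs. A small self-contained sketch with invented URNs:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)
from datahub.sql_parsing.datajob import to_datajob_input_output

# Invented URNs purely for illustration.
src = make_dataset_urn("mssql", "mydb.dbo.source_table")
dst = make_dataset_urn("mssql", "mydb.dbo.target_table")

mcp = MetadataChangeProposalWrapper(
    entityUrn=dst,
    aspect=UpstreamLineageClass(
        upstreams=[UpstreamClass(dataset=src, type=DatasetLineageTypeClass.TRANSFORMED)]
    ),
)

io = to_datajob_input_output(mcps=[mcp])
# Expect io.inputDatasets == [src] and io.outputDatasets == [dst].
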
datahub/sql_parsing/query_types.py CHANGED
@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
  identifier: sqlglot.exp.Identifier = table.this

  return identifier.args.get("temporary") or (
- is_dialect_instance(dialect, "redshift") and identifier.name.startswith("#")
+ # These dialects use # as a prefix for temp tables.
+ is_dialect_instance(
+ dialect,
+ [
+ "redshift",
+ "mssql",
+ # sybase is another one, but we don't support that dialect yet.
+ ],
+ )
+ and identifier.name.startswith("#")
  )


datahub/sql_parsing/split_statements.py ADDED
@@ -0,0 +1,163 @@
+ import re
+ from enum import Enum
+ from typing import Generator, List, Tuple
+
+ CONTROL_FLOW_KEYWORDS = [
+ "GO",
+ r"BEGIN\w+TRY",
+ r"BEGIN\w+CATCH",
+ "BEGIN",
+ r"END\w+TRY",
+ r"END\w+CATCH",
+ "END",
+ ]
+
+ # There's an exception to this rule, which is when the statement
+ # is preceeded by a CTE.
+ FORCE_NEW_STATEMENT_KEYWORDS = [
+ # SELECT is used inside queries as well, so we can't include it here.
+ "INSERT",
+ "UPDATE",
+ "DELETE",
+ "MERGE",
+ ]
+
+
+ class ParserState(Enum):
+ NORMAL = 1
+ STRING = 2
+ COMMENT = 3
+ MULTILINE_COMMENT = 4
+
+
+ def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+ """
+ Check if a keyword exists at the given position using regex word boundaries.
+ """
+ if pos + len(keyword) > len(sql):
+ return False
+
+ # If we're not at a word boundary, we can't generate a keyword.
+ if pos > 0 and not (
+ bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+ or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+ ):
+ return False
+
+ pattern = rf"^{re.escape(keyword)}\b"
+ match = re.match(pattern, sql[pos:], re.IGNORECASE)
+ return bool(match)
+
+
+ def _look_ahead_for_keywords(
+ sql: str, pos: int, keywords: List[str]
+ ) -> Tuple[bool, str, int]:
+ """
+ Look ahead for SQL keywords at the current position.
+ """
+
+ for keyword in keywords:
+ if _is_keyword_at_position(sql, pos, keyword):
+ return True, keyword, len(keyword)
+ return False, "", 0
+
+
+ def split_statements(sql: str) -> Generator[str, None, None]:
+ """
+ Split T-SQL code into individual statements, handling various SQL constructs.
+ """
+ if not sql or not sql.strip():
+ return
+
+ current_statement: List[str] = []
+ state = ParserState.NORMAL
+ i = 0
+
+ def yield_if_complete() -> Generator[str, None, None]:
+ statement = "".join(current_statement).strip()
+ if statement:
+ yield statement
+ current_statement.clear()
+
+ prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+ while i < len(sql):
+ c = sql[i]
+ next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+ if state == ParserState.NORMAL:
+ if c == "'":
+ state = ParserState.STRING
+ current_statement.append(c)
+ prev_real_char = c
+ elif c == "-" and next_char == "-":
+ state = ParserState.COMMENT
+ current_statement.append(c)
+ current_statement.append(next_char)
+ i += 1
+ elif c == "/" and next_char == "*":
+ state = ParserState.MULTILINE_COMMENT
+ current_statement.append(c)
+ current_statement.append(next_char)
+ i += 1
+ else:
+ most_recent_real_char = prev_real_char
+ if not c.isspace():
+ prev_real_char = c
+
+ is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+ sql, i, keywords=CONTROL_FLOW_KEYWORDS
+ )
+ if is_control_keyword:
+ # Yield current statement if any
+ yield from yield_if_complete()
+ # Yield keyword as its own statement
+ yield keyword
+ i += keyword_len
+ continue
+
+ (
+ is_force_new_statement_keyword,
+ keyword,
+ keyword_len,
+ ) = _look_ahead_for_keywords(
+ sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+ )
+ if (
+ is_force_new_statement_keyword and most_recent_real_char != ")"
+ ):  # usually we'd have a close paren that closes a CTE
+ # Force termination of current statement
+ yield from yield_if_complete()
+
+ current_statement.append(keyword)
+ i += keyword_len
+ continue
+
+ elif c == ";":
+ yield from yield_if_complete()
+ else:
+ current_statement.append(c)
+
+ elif state == ParserState.STRING:
+ current_statement.append(c)
+ if c == "'" and next_char == "'":
+ current_statement.append(next_char)
+ i += 1
+ elif c == "'":
+ state = ParserState.NORMAL
+
+ elif state == ParserState.COMMENT:
+ current_statement.append(c)
+ if c == "\n":
+ state = ParserState.NORMAL
+
+ elif state == ParserState.MULTILINE_COMMENT:
+ current_statement.append(c)
+ if c == "*" and next_char == "/":
+ current_statement.append(next_char)
+ i += 1
+ state = ParserState.NORMAL
+
+ i += 1
+
+ # Handle the last statement
+ yield from yield_if_complete()
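
Note: a quick, hypothetical example of what the new splitter produces for a small T-SQL batch; per the code above, control-flow keywords such as GO are yielded as standalone statements and INSERT/UPDATE/DELETE/MERGE force a new statement. The sample input is made up:

from datahub.sql_parsing.split_statements import split_statements

SAMPLE_SQL = """
CREATE TABLE #tmp (id INT);
INSERT INTO #tmp SELECT id FROM dbo.src
GO
SELECT * FROM #tmp
"""

# Prints each parsed statement on its own line.
for statement in split_statements(SAMPLE_SQL):
    print(repr(statement))
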
datahub/sql_parsing/sql_parsing_aggregator.py CHANGED
@@ -762,7 +762,6 @@ class SqlParsingAggregator(Closeable):

  This assumes that queries come in order of increasing timestamps.
  """
-
  self.report.num_observed_queries += 1

  # All queries with no session ID are assumed to be part of the same session.