acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/METADATA +2306 -2306
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD +14 -11
- datahub/__init__.py +1 -1
- datahub/ingestion/source/sql/mssql/job_models.py +1 -0
- datahub/ingestion/source/sql/mssql/source.py +113 -38
- datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
- datahub/ingestion/source/sql/sql_common.py +10 -2
- datahub/sql_parsing/datajob.py +50 -0
- datahub/sql_parsing/query_types.py +10 -1
- datahub/sql_parsing/split_statements.py +163 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=sacWO6qm2tPLBhc25GFoFnt_AeiT0Qk9ZiibpHJbPhQ,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -449,7 +449,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
 datahub/ingestion/source/sql/oracle.py,sha256=exJJvMSC42Oj0IiFtpAPFatWkRu9mVu8sd7ofOdanqU,22460
 datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
 datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
-datahub/ingestion/source/sql/sql_common.py,sha256=
+datahub/ingestion/source/sql/sql_common.py,sha256=Qe481Pct1vfl-NVpgOufSEaNSMgJGD3bq1DPvjIkPTg,52164
 datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
 datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
 datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=vWna43rv9ClfK9MBNTx8tdoUr35nD5d8ppgeamEEhSQ,12528
@@ -462,8 +462,9 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
 datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
 datahub/ingestion/source/sql/vertica.py,sha256=pkx-1JDBmow7WhoEIEh0SLj7kff9L0zUOHDytoC43gk,33339
 datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
-datahub/ingestion/source/sql/mssql/job_models.py,sha256
-datahub/ingestion/source/sql/mssql/source.py,sha256=
+datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
+datahub/ingestion/source/sql/mssql/source.py,sha256=fzpWjwexGvJgpd6Z4DCsK6Ld2vQCfPkD2M1xE4pU9Ec,29542
+datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
 datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
 datahub/ingestion/source/state/entity_removal_state.py,sha256=zvIsmYg7oiIu2FhecU0VfLBNToUqvKoKyDeiFfkOcyc,6611
@@ -858,9 +859,11 @@ datahub/specific/structured_property.py,sha256=IYeFyafPidNrDbn1sU65rEPwIZDS-wLY1
 datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
 datahub/sql_parsing/_sqlglot_patch.py,sha256=iYJ8zOThHqqbamD5jdNr9iHTWD7ewNeHzPiTb6-rO3Y,7043
-datahub/sql_parsing/
+datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
 datahub/sql_parsing/schema_resolver.py,sha256=9wbJT80K4nsIHHOuQLso9QoFQAYwfSJZnqHwsaU3UTY,10197
-datahub/sql_parsing/
+datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
+datahub/sql_parsing/sql_parsing_aggregator.py,sha256=gLelf5l73EufB8qijb9ZDLANkt4o05schGg4DY-bOJs,69937
 datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
 datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
 datahub/sql_parsing/sqlglot_lineage.py,sha256=K532_zvj7vEt2Ci2d3kxYZLZkq4S0ECBRZ2nHuY11a4,45927
@@ -967,8 +970,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
+acryl_datahub-0.14.1.13rc5.dist-info/METADATA,sha256=9ZZBowQ_PYKf6d4V6EHqTTlJDY77BqxSX8uKfiIqFT4,171138
+acryl_datahub-0.14.1.13rc5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.14.1.13rc5.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+acryl_datahub-0.14.1.13rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.14.1.13rc5.dist-info/RECORD,,
datahub/ingestion/source/sql/mssql/source.py
CHANGED
@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
     ProcedureParameter,
     StoredProcedure,
 )
+from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+    generate_procedure_lineage,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.file_backed_collections import FileBackedList
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable to convert the SQL Server assets urns to lowercase",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description="Enable lineage extraction for stored procedures",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
         self.current_database = None
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
+        self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
-        for
+        for _step_id, step_data in job_steps.items():
             step = JobStep(
                 job_name=job.formatted_name,
                 step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
         if procedures:
             yield from self.construct_flow_workunits(data_flow=data_flow)
         for procedure in procedures:
+            yield from self._process_stored_procedure(conn, procedure)
+
+    def _process_stored_procedure(
+        self, conn: Connection, procedure: StoredProcedure
+    ) -> Iterable[MetadataWorkUnit]:
+        upstream = self._get_procedure_upstream(conn, procedure)
+        downstream = self._get_procedure_downstream(conn, procedure)
+        data_job = MSSQLDataJob(
+            entity=procedure,
+        )
+        # TODO: because of this upstream and downstream are more dependencies,
+        #  can't be used as DataJobInputOutput.
+        #  Should be reorganized into lineage.
+        data_job.add_property("procedure_depends_on", str(upstream.as_property))
+        data_job.add_property("depending_on_procedure", str(downstream.as_property))
+        procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+        procedure.code = procedure_code
+        if procedure_definition:
+            data_job.add_property("definition", procedure_definition)
+        if procedure_code and self.config.include_stored_procedures_code:
+            data_job.add_property("code", procedure_code)
+        procedure_inputs = self._get_procedure_inputs(conn, procedure)
+        properties = self._get_procedure_properties(conn, procedure)
+        data_job.add_property(
+            "input parameters", str([param.name for param in procedure_inputs])
+        )
+        for param in procedure_inputs:
+            data_job.add_property(f"parameter {param.name}", str(param.properties))
+        for property_name, property_value in properties.items():
+            data_job.add_property(property_name, str(property_value))
+        if self.config.include_lineage:
+            # These will be used to construct lineage
+            self.stored_procedures.append(procedure)
+        yield from self.construct_job_workunits(
+            data_job,
+            # For stored procedure lineage is ingested later
+            include_lineage=False,
+        )
 
     @staticmethod
     def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                     code_list.append(row["Text"])
                     if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                         code_slice_index = index
-                definition = "
-                code = "
+                definition = "".join(code_list[:code_slice_index])
+                code = "".join(code_list[code_slice_index:])
             except ResourceClosedError:
                 logger.warning(
                     "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
     def construct_job_workunits(
         self,
         data_job: MSSQLDataJob,
+        include_lineage: bool = True,
     ) -> Iterable[MetadataWorkUnit]:
         yield MetadataChangeProposalWrapper(
             entityUrn=data_job.urn,
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()
 
+        if include_lineage:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_datajob_input_output_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear
 
     def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else qualified_table_name
         )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        yield from super().get_workunits_internal()
+
+        # This is done at the end so that we will have access to tables
+        # from all databases in schema_resolver and discovered_tables
+        for procedure in self.stored_procedures:
+            with self.report.report_exc(
+                message="Failed to parse stored procedure lineage",
+                context=procedure.full_name,
+                level=StructuredLogLevel.WARN,
+            ):
+                yield from auto_workunit(
+                    generate_procedure_lineage(
+                        schema_resolver=self.schema_resolver,
+                        procedure=procedure,
+                        procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                        is_temp_table=self.is_temp_table,
+                    )
+                )
+
+    def is_temp_table(self, name: str) -> bool:
+        try:
+            parts = name.split(".")
+            table_name = parts[-1]
+            schema_name = parts[-2]
+            db_name = parts[-3]
+
+            if table_name.startswith("#"):
+                return True
+
+            # This is also a temp table if
+            #   1. this name would be allowed by the dataset patterns, and
+            #   2. we have a list of discovered tables, and
+            #   3. it's not in the discovered tables list
+            if (
+                self.config.database_pattern.allowed(db_name)
+                and self.config.schema_pattern.allowed(schema_name)
+                and self.config.table_pattern.allowed(name)
+                and self.standardize_identifier_case(name)
+                not in self.discovered_datasets
+            ):
+                logger.debug(f"inferred as temp table {name}")
+                return True
+
+        except Exception:
+            logger.warning(f"Error parsing table name {name} ")
+        return False
+
+    def standardize_identifier_case(self, table_ref_str: str) -> str:
+        return (
+            table_ref_str.lower()
+            if self.config.convert_urns_to_lowercase
+            else table_ref_str
+        )
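The new include_lineage flag defaults to True, so stored procedure lineage is collected during a regular mssql ingestion run. Below is a minimal sketch of a recipe that exercises it, assuming the standard DataHub Pipeline API; the connection values are placeholders, not part of this release.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical connection details; only include_lineage is specific to this release.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "localhost:1433",
                "database": "DemoData",
                "username": "datahub",
                "password": "example-password",
                # Parse stored procedure bodies and emit table-level lineage
                # for the corresponding DataJob entities.
                "include_lineage": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()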
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py
ADDED
@@ -0,0 +1,84 @@
+import logging
+from typing import Callable, Iterable, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+from datahub.sql_parsing.datajob import to_datajob_input_output
+from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.split_statements import split_statements
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    ObservedQuery,
+    SqlParsingAggregator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_procedure_code(
+    *,
+    schema_resolver: SchemaResolver,
+    default_db: Optional[str],
+    default_schema: Optional[str],
+    code: str,
+    is_temp_table: Callable[[str], bool],
+    raise_: bool = False,
+) -> Optional[DataJobInputOutputClass]:
+    aggregator = SqlParsingAggregator(
+        platform=schema_resolver.platform,
+        env=schema_resolver.env,
+        schema_resolver=schema_resolver,
+        generate_lineage=True,
+        generate_queries=False,
+        generate_usage_statistics=False,
+        generate_operations=False,
+        generate_query_subject_fields=False,
+        generate_query_usage_statistics=False,
+        is_temp_table=is_temp_table,
+    )
+    for query in split_statements(code):
+        # TODO: We should take into account `USE x` statements.
+        aggregator.add_observed_query(
+            observed=ObservedQuery(
+                default_db=default_db,
+                default_schema=default_schema,
+                query=query,
+            )
+        )
+    if aggregator.report.num_observed_queries_failed and raise_:
+        logger.info(aggregator.report.as_string())
+        raise ValueError(
+            f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+        )
+
+    mcps = list(aggregator.gen_metadata())
+    return to_datajob_input_output(
+        mcps=mcps,
+        ignore_extra_mcps=True,
+    )
+
+
+# Is procedure handling generic enough to be added to SqlParsingAggregator?
+def generate_procedure_lineage(
+    *,
+    schema_resolver: SchemaResolver,
+    procedure: StoredProcedure,
+    procedure_job_urn: str,
+    is_temp_table: Callable[[str], bool] = lambda _: False,
+    raise_: bool = False,
+) -> Iterable[MetadataChangeProposalWrapper]:
+    if procedure.code:
+        datajob_input_output = parse_procedure_code(
+            schema_resolver=schema_resolver,
+            default_db=procedure.db,
+            default_schema=procedure.schema,
+            code=procedure.code,
+            is_temp_table=is_temp_table,
+            raise_=raise_,
+        )
+
+        if datajob_input_output:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=procedure_job_urn,
+                aspect=datajob_input_output,
+            )
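generate_procedure_lineage wraps parse_procedure_code, which can also be driven directly. A rough sketch follows, assuming an empty SchemaResolver and an illustrative procedure body; the table names, defaults, and temp-table heuristic are made up for the example.

from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
    parse_procedure_code,
)
from datahub.sql_parsing.schema_resolver import SchemaResolver

resolver = SchemaResolver(platform="mssql", env="PROD")
io_aspect = parse_procedure_code(
    schema_resolver=resolver,
    default_db="DemoData",   # illustrative defaults
    default_schema="dbo",
    code="INSERT INTO dbo.sales_summary SELECT region, SUM(amount) FROM dbo.sales GROUP BY region;",
    is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
)
if io_aspect:
    print(io_aspect.inputDatasets)   # upstream dataset urns parsed from the body
    print(io_aspect.outputDatasets)  # downstream dataset urns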
datahub/ingestion/source/sql/sql_common.py
CHANGED
@@ -392,6 +392,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             platform_instance=self.config.platform_instance,
             env=self.config.env,
         )
+        self.discovered_datasets: Set[str] = set()
         self._view_definition_cache: MutableMapping[str, str]
         if self.config.use_file_backed_cache:
            self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +832,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         self._classify(dataset_name, schema, table, data_reader, schema_metadata)
 
         dataset_snapshot.aspects.append(schema_metadata)
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)
 
         yield from self.add_table_to_schema_container(
@@ -1126,8 +1128,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns,
             canonical_schema=schema_fields,
         )
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         description, properties, _ = self.get_table_properties(inspector, schema, view)
         try:
             view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1193,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             domain_registry=self.domain_registry,
         )
 
+    def _save_schema_to_resolver(self):
+        return self.config.include_view_lineage or (
+            hasattr(self.config, "include_lineage") and self.config.include_lineage
+        )
+
     def _run_sql_parser(
         self, view_identifier: str, query: str, schema_resolver: SchemaResolver
     ) -> Optional[SqlParsingResult]:
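The new _save_schema_to_resolver check means schemas are registered with the SchemaResolver (and their names tracked in discovered_datasets) when either view lineage or the SQL Server-specific include_lineage option is enabled. A standalone illustration of the predicate, using stand-in config objects rather than the real pydantic models:

from types import SimpleNamespace

def save_schema_to_resolver(config) -> bool:
    # Mirrors the logic added to SQLAlchemySource._save_schema_to_resolver.
    return config.include_view_lineage or (
        hasattr(config, "include_lineage") and config.include_lineage
    )

generic_config = SimpleNamespace(include_view_lineage=False)  # no include_lineage attribute
mssql_config = SimpleNamespace(include_view_lineage=False, include_lineage=True)
print(save_schema_to_resolver(generic_config))  # False
print(save_schema_to_resolver(mssql_config))    # True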
datahub/sql_parsing/datajob.py
ADDED
@@ -0,0 +1,50 @@
+import logging
+from typing import Iterable, List, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.metadata.schema_classes import (
+    DataJobInputOutputClass,
+    FineGrainedLineageClass,
+    UpstreamLineageClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_datajob_input_output(
+    *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True
+) -> Optional[DataJobInputOutputClass]:
+    inputDatasets: List[str] = []
+    outputDatasets: List[str] = []
+    fineGrainedLineages: List[FineGrainedLineageClass] = []
+    for mcp in mcps:
+        # TODO: Represent simple write operations without lineage as outputDatasets.
+
+        upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass)
+        if upstream_lineage is not None:
+            if mcp.entityUrn and mcp.entityUrn not in outputDatasets:
+                outputDatasets.append(mcp.entityUrn)
+
+            for upstream in upstream_lineage.upstreams:
+                if upstream.dataset not in inputDatasets:
+                    inputDatasets.append(upstream.dataset)
+
+            if upstream_lineage.fineGrainedLineages:
+                for fineGrainedLineage in upstream_lineage.fineGrainedLineages:
+                    fineGrainedLineages.append(fineGrainedLineage)
+
+        elif ignore_extra_mcps:
+            pass
+        else:
+            raise ValueError(
+                f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}"
+            )
+
+    if not inputDatasets and not outputDatasets:
+        return None
+
+    return DataJobInputOutputClass(
+        inputDatasets=inputDatasets,
+        outputDatasets=outputDatasets,
+        fineGrainedLineages=fineGrainedLineages,
+    )
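to_datajob_input_output folds dataset-level upstreamLineage MCPs into a single DataJobInputOutput aspect. A small sketch with one hand-built MCP; the URNs and dataset names are illustrative only.

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)
from datahub.sql_parsing.datajob import to_datajob_input_output

output_urn = make_dataset_urn("mssql", "demodata.dbo.sales_summary")
input_urn = make_dataset_urn("mssql", "demodata.dbo.sales")

mcp = MetadataChangeProposalWrapper(
    entityUrn=output_urn,
    aspect=UpstreamLineageClass(
        upstreams=[
            UpstreamClass(dataset=input_urn, type=DatasetLineageTypeClass.TRANSFORMED)
        ]
    ),
)

io = to_datajob_input_output(mcps=[mcp], ignore_extra_mcps=False)
# Expected: the MCP's entity becomes an output and its upstreams become inputs.
assert io is not None
assert io.inputDatasets == [input_urn]
assert io.outputDatasets == [output_urn]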
datahub/sql_parsing/query_types.py
CHANGED
@@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool:
     identifier: sqlglot.exp.Identifier = table.this
 
     return identifier.args.get("temporary") or (
+        # These dialects use # as a prefix for temp tables.
+        is_dialect_instance(
+            dialect,
+            [
+                "redshift",
+                "mssql",
+                # sybase is another one, but we don't support that dialect yet.
+            ],
+        )
+        and identifier.name.startswith("#")
     )
 
 
datahub/sql_parsing/split_statements.py
ADDED
@@ -0,0 +1,163 @@
+import re
+from enum import Enum
+from typing import Generator, List, Tuple
+
+CONTROL_FLOW_KEYWORDS = [
+    "GO",
+    r"BEGIN\w+TRY",
+    r"BEGIN\w+CATCH",
+    "BEGIN",
+    r"END\w+TRY",
+    r"END\w+CATCH",
+    "END",
+]
+
+# There's an exception to this rule, which is when the statement
+# is preceeded by a CTE.
+FORCE_NEW_STATEMENT_KEYWORDS = [
+    # SELECT is used inside queries as well, so we can't include it here.
+    "INSERT",
+    "UPDATE",
+    "DELETE",
+    "MERGE",
+]
+
+
+class ParserState(Enum):
+    NORMAL = 1
+    STRING = 2
+    COMMENT = 3
+    MULTILINE_COMMENT = 4
+
+
+def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
+    """
+    Check if a keyword exists at the given position using regex word boundaries.
+    """
+    if pos + len(keyword) > len(sql):
+        return False
+
+    # If we're not at a word boundary, we can't generate a keyword.
+    if pos > 0 and not (
+        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+    ):
+        return False
+
+    pattern = rf"^{re.escape(keyword)}\b"
+    match = re.match(pattern, sql[pos:], re.IGNORECASE)
+    return bool(match)
+
+
+def _look_ahead_for_keywords(
+    sql: str, pos: int, keywords: List[str]
+) -> Tuple[bool, str, int]:
+    """
+    Look ahead for SQL keywords at the current position.
+    """
+
+    for keyword in keywords:
+        if _is_keyword_at_position(sql, pos, keyword):
+            return True, keyword, len(keyword)
+    return False, "", 0
+
+
+def split_statements(sql: str) -> Generator[str, None, None]:
+    """
+    Split T-SQL code into individual statements, handling various SQL constructs.
+    """
+    if not sql or not sql.strip():
+        return
+
+    current_statement: List[str] = []
+    state = ParserState.NORMAL
+    i = 0
+
+    def yield_if_complete() -> Generator[str, None, None]:
+        statement = "".join(current_statement).strip()
+        if statement:
+            yield statement
+        current_statement.clear()
+
+    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+    while i < len(sql):
+        c = sql[i]
+        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
+
+        if state == ParserState.NORMAL:
+            if c == "'":
+                state = ParserState.STRING
+                current_statement.append(c)
+                prev_real_char = c
+            elif c == "-" and next_char == "-":
+                state = ParserState.COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            elif c == "/" and next_char == "*":
+                state = ParserState.MULTILINE_COMMENT
+                current_statement.append(c)
+                current_statement.append(next_char)
+                i += 1
+            else:
+                most_recent_real_char = prev_real_char
+                if not c.isspace():
+                    prev_real_char = c
+
+                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
+                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
+                )
+                if is_control_keyword:
+                    # Yield current statement if any
+                    yield from yield_if_complete()
+                    # Yield keyword as its own statement
+                    yield keyword
+                    i += keyword_len
+                    continue
+
+                (
+                    is_force_new_statement_keyword,
+                    keyword,
+                    keyword_len,
+                ) = _look_ahead_for_keywords(
+                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
+                )
+                if (
+                    is_force_new_statement_keyword and most_recent_real_char != ")"
+                ):  # usually we'd have a close paren that closes a CTE
+                    # Force termination of current statement
+                    yield from yield_if_complete()
+
+                    current_statement.append(keyword)
+                    i += keyword_len
+                    continue
+
+                elif c == ";":
+                    yield from yield_if_complete()
+                else:
+                    current_statement.append(c)
+
+        elif state == ParserState.STRING:
+            current_statement.append(c)
+            if c == "'" and next_char == "'":
+                current_statement.append(next_char)
+                i += 1
+            elif c == "'":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.COMMENT:
+            current_statement.append(c)
+            if c == "\n":
+                state = ParserState.NORMAL
+
+        elif state == ParserState.MULTILINE_COMMENT:
+            current_statement.append(c)
+            if c == "*" and next_char == "/":
+                current_statement.append(next_char)
+                i += 1
+                state = ParserState.NORMAL
+
+        i += 1
+
+    # Handle the last statement
+    yield from yield_if_complete()
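A short sketch of what split_statements is expected to yield for a small T-SQL procedure body; the statements themselves are illustrative.

from datahub.sql_parsing.split_statements import split_statements

procedure_body = """
CREATE PROCEDURE dbo.refresh_summary AS
BEGIN
    INSERT INTO dbo.sales_summary SELECT region, SUM(amount) FROM dbo.sales GROUP BY region;
    UPDATE dbo.etl_audit SET last_run = GETDATE()
END
"""

for statement in split_statements(procedure_body):
    print(repr(statement))
# Control-flow keywords such as BEGIN and END come back as standalone statements,
# while the INSERT and UPDATE statements are split out even without a trailing semicolon.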
{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL
RENAMED
File without changes
{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt
RENAMED
File without changes
{acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt
RENAMED
File without changes