acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2557 -2557
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +34 -30
- datahub/__init__.py +1 -1
- datahub/configuration/kafka_consumer_config.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
- datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
- datahub/ingestion/source/ge_data_profiler.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +6 -2
- datahub/ingestion/source/redshift/report.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
- datahub/ingestion/source/sql/mssql/job_models.py +1 -0
- datahub/ingestion/source/sql/mssql/source.py +113 -38
- datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
- datahub/ingestion/source/sql/oracle.py +50 -0
- datahub/ingestion/source/sql/sql_common.py +28 -54
- datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
- datahub/ingestion/source/sql/sql_report.py +75 -0
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/sql/vertica.py +2 -2
- datahub/ingestion/source/unity/report.py +2 -2
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/AssertionInfo.avsc +1 -1
- datahub/metadata/schemas/InputFields.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
- datahub/sql_parsing/datajob.py +50 -0
- datahub/sql_parsing/query_types.py +10 -1
- datahub/sql_parsing/split_statements.py +163 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/mssql/source.py:

@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
     ProcedureParameter,
     StoredProcedure,
 )
+from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+    generate_procedure_lineage,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.file_backed_collections import FileBackedList

 logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable to convert the SQL Server assets urns to lowercase",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description="Enable lineage extraction for stored procedures",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
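The new `include_lineage` flag defaults to `True`, so stored-procedure lineage is extracted unless it is switched off. Below is a minimal sketch of toggling it from a programmatic ingestion recipe; the connection values are placeholders, and the dict-based `Pipeline.create` API plus the `console` sink are assumed to be available as in other DataHub sources.

```python
# Hypothetical recipe: all connection values are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "localhost:1433",
                "database": "DemoDB",
                "username": "datahub",
                "password": "example-password",
                # New in this release; set to False to skip stored-procedure
                # lineage extraction (it defaults to True).
                "include_lineage": False,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```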
datahub/ingestion/source/sql/mssql/source.py (continued):

@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
         self.current_database = None
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
+        self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
-        for
+        for _step_id, step_data in job_steps.items():
             step = JobStep(
                 job_name=job.formatted_name,
                 step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
             if procedures:
                 yield from self.construct_flow_workunits(data_flow=data_flow)
             for procedure in procedures:
-                … (31 removed lines; their content was not captured in this diff view)
+                yield from self._process_stored_procedure(conn, procedure)
+
+    def _process_stored_procedure(
+        self, conn: Connection, procedure: StoredProcedure
+    ) -> Iterable[MetadataWorkUnit]:
+        upstream = self._get_procedure_upstream(conn, procedure)
+        downstream = self._get_procedure_downstream(conn, procedure)
+        data_job = MSSQLDataJob(
+            entity=procedure,
+        )
+        # TODO: because of this upstream and downstream are more dependencies,
+        # can't be used as DataJobInputOutput.
+        # Should be reorganized into lineage.
+        data_job.add_property("procedure_depends_on", str(upstream.as_property))
+        data_job.add_property("depending_on_procedure", str(downstream.as_property))
+        procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+        procedure.code = procedure_code
+        if procedure_definition:
+            data_job.add_property("definition", procedure_definition)
+        if procedure_code and self.config.include_stored_procedures_code:
+            data_job.add_property("code", procedure_code)
+        procedure_inputs = self._get_procedure_inputs(conn, procedure)
+        properties = self._get_procedure_properties(conn, procedure)
+        data_job.add_property(
+            "input parameters", str([param.name for param in procedure_inputs])
+        )
+        for param in procedure_inputs:
+            data_job.add_property(f"parameter {param.name}", str(param.properties))
+        for property_name, property_value in properties.items():
+            data_job.add_property(property_name, str(property_value))
+        if self.config.include_lineage:
+            # These will be used to construct lineage
+            self.stored_procedures.append(procedure)
+        yield from self.construct_job_workunits(
+            data_job,
+            # For stored procedure lineage is ingested later
+            include_lineage=False,
+        )

     @staticmethod
     def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                 code_list.append(row["Text"])
                 if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                     code_slice_index = index
-            definition = "
-            code = "
+            definition = "".join(code_list[:code_slice_index])
+            code = "".join(code_list[code_slice_index:])
         except ResourceClosedError:
             logger.warning(
                 "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
     def construct_job_workunits(
         self,
         data_job: MSSQLDataJob,
+        include_lineage: bool = True,
     ) -> Iterable[MetadataWorkUnit]:
         yield MetadataChangeProposalWrapper(
             entityUrn=data_job.urn,
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()

-        … (4 removed lines; their content was not captured in this diff view)
+        if include_lineage:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_datajob_input_output_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear

     def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else qualified_table_name
         )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        yield from super().get_workunits_internal()
+
+        # This is done at the end so that we will have access to tables
+        # from all databases in schema_resolver and discovered_tables
+        for procedure in self.stored_procedures:
+            with self.report.report_exc(
+                message="Failed to parse stored procedure lineage",
+                context=procedure.full_name,
+                level=StructuredLogLevel.WARN,
+            ):
+                yield from auto_workunit(
+                    generate_procedure_lineage(
+                        schema_resolver=self.schema_resolver,
+                        procedure=procedure,
+                        procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                        is_temp_table=self.is_temp_table,
+                    )
+                )
+
+    def is_temp_table(self, name: str) -> bool:
+        try:
+            parts = name.split(".")
+            table_name = parts[-1]
+            schema_name = parts[-2]
+            db_name = parts[-3]
+
+            if table_name.startswith("#"):
+                return True
+
+            # This is also a temp table if
+            # 1. this name would be allowed by the dataset patterns, and
+            # 2. we have a list of discovered tables, and
+            # 3. it's not in the discovered tables list
+            if (
+                self.config.database_pattern.allowed(db_name)
+                and self.config.schema_pattern.allowed(schema_name)
+                and self.config.table_pattern.allowed(name)
+                and self.standardize_identifier_case(name)
+                not in self.discovered_datasets
+            ):
+                logger.debug(f"inferred as temp table {name}")
+                return True
+
+        except Exception:
+            logger.warning(f"Error parsing table name {name} ")
+        return False
+
+    def standardize_identifier_case(self, table_ref_str: str) -> str:
+        return (
+            table_ref_str.lower()
+            if self.config.convert_urns_to_lowercase
+            else table_ref_str
+        )
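The `is_temp_table` heuristic above treats `#`-prefixed names, and names that would pass the configured patterns but were never discovered, as temporary. A rough standalone sketch of the same idea, with made-up names and a hard-coded discovered set standing in for the source's config patterns:

```python
# Standalone illustration only; the real implementation consults the source's
# database/schema/table patterns and its discovered_datasets set.
discovered = {"demodb.dbo.sales", "demodb.dbo.sales_summary"}

def looks_like_temp_table(name: str) -> bool:
    table = name.split(".")[-1]
    if table.startswith("#"):               # classic local temp table
        return True
    return name.lower() not in discovered   # allowed name, but never ingested

print(looks_like_temp_table("DemoDB.dbo.#staging"))       # True
print(looks_like_temp_table("DemoDB.dbo.sales"))          # False
print(looks_like_temp_table("DemoDB.dbo.tmp_load_step"))  # True
```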
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py (new file):

@@ -0,0 +1,84 @@
+import logging
+from typing import Callable, Iterable, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+from datahub.sql_parsing.datajob import to_datajob_input_output
+from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.split_statements import split_statements
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    ObservedQuery,
+    SqlParsingAggregator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_procedure_code(
+    *,
+    schema_resolver: SchemaResolver,
+    default_db: Optional[str],
+    default_schema: Optional[str],
+    code: str,
+    is_temp_table: Callable[[str], bool],
+    raise_: bool = False,
+) -> Optional[DataJobInputOutputClass]:
+    aggregator = SqlParsingAggregator(
+        platform=schema_resolver.platform,
+        env=schema_resolver.env,
+        schema_resolver=schema_resolver,
+        generate_lineage=True,
+        generate_queries=False,
+        generate_usage_statistics=False,
+        generate_operations=False,
+        generate_query_subject_fields=False,
+        generate_query_usage_statistics=False,
+        is_temp_table=is_temp_table,
+    )
+    for query in split_statements(code):
+        # TODO: We should take into account `USE x` statements.
+        aggregator.add_observed_query(
+            observed=ObservedQuery(
+                default_db=default_db,
+                default_schema=default_schema,
+                query=query,
+            )
+        )
+    if aggregator.report.num_observed_queries_failed and raise_:
+        logger.info(aggregator.report.as_string())
+        raise ValueError(
+            f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+        )
+
+    mcps = list(aggregator.gen_metadata())
+    return to_datajob_input_output(
+        mcps=mcps,
+        ignore_extra_mcps=True,
+    )
+
+
+# Is procedure handling generic enough to be added to SqlParsingAggregator?
+def generate_procedure_lineage(
+    *,
+    schema_resolver: SchemaResolver,
+    procedure: StoredProcedure,
+    procedure_job_urn: str,
+    is_temp_table: Callable[[str], bool] = lambda _: False,
+    raise_: bool = False,
+) -> Iterable[MetadataChangeProposalWrapper]:
+    if procedure.code:
+        datajob_input_output = parse_procedure_code(
+            schema_resolver=schema_resolver,
+            default_db=procedure.db,
+            default_schema=procedure.schema,
+            code=procedure.code,
+            is_temp_table=is_temp_table,
+            raise_=raise_,
+        )
+
+        if datajob_input_output:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=procedure_job_urn,
+                aspect=datajob_input_output,
+            )
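For a sense of how this module is meant to be driven, here is a minimal, hypothetical call to `parse_procedure_code` with an offline `SchemaResolver` and a one-statement procedure body; the table names are invented, and the resulting lineage may be empty if the parser cannot resolve them.

```python
# Illustrative only: a single INSERT ... SELECT statement as the procedure body.
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
    parse_procedure_code,
)
from datahub.sql_parsing.schema_resolver import SchemaResolver

resolver = SchemaResolver(platform="mssql", env="PROD")  # no live schema lookups

io_aspect = parse_procedure_code(
    schema_resolver=resolver,
    default_db="DemoDB",
    default_schema="dbo",
    code=(
        "INSERT INTO dbo.sales_summary "
        "SELECT region, SUM(amount) AS amount FROM dbo.sales GROUP BY region"
    ),
    is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
)
if io_aspect:
    print(io_aspect.inputDatasets)   # upstream dataset URNs
    print(io_aspect.outputDatasets)  # downstream dataset URNs
```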
datahub/ingestion/source/sql/oracle.py:

@@ -1,3 +1,4 @@
+import datetime
 import logging
 import re

@@ -631,3 +632,52 @@ class OracleSource(SQLAlchemySource):
             clear=False,
         ):
             return super().get_workunits()
+
+    def generate_profile_candidates(
+        self,
+        inspector: Inspector,
+        threshold_time: Optional[datetime.datetime],
+        schema: str,
+    ) -> Optional[List[str]]:
+        tables_table_name = (
+            "ALL_TABLES" if self.config.data_dictionary_mode == "ALL" else "DBA_TABLES"
+        )
+
+        # If stats are available , they are used even if they are stale.
+        # Assuming that the table would typically grow over time, this will ensure to filter
+        # large tables known at stats collection time from profiling candidates.
+        # If stats are not available (NULL), such tables are not filtered and are considered
+        # as profiling candidates.
+        cursor = inspector.bind.execute(
+            sql.text(
+                f"""SELECT
+                    t.OWNER,
+                    t.TABLE_NAME,
+                    t.NUM_ROWS,
+                    t.LAST_ANALYZED,
+                    COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) AS SIZE_GB
+                FROM {tables_table_name} t
+                WHERE t.OWNER = :owner
+                AND (t.NUM_ROWS < :table_row_limit OR t.NUM_ROWS IS NULL)
+                AND COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) < :table_size_limit
+                """
+            ),
+            dict(
+                owner=inspector.dialect.denormalize_name(schema),
+                table_row_limit=self.config.profiling.profile_table_row_limit,
+                table_size_limit=self.config.profiling.profile_table_size_limit,
+            ),
+        )
+
+        TABLE_NAME_COL_LOC = 1
+        return [
+            self.get_identifier(
+                schema=schema,
+                entity=inspector.dialect.normalize_name(row[TABLE_NAME_COL_LOC])
+                or _raise_err(
+                    ValueError(f"Invalid table name: {row[TABLE_NAME_COL_LOC]}")
+                ),
+                inspector=inspector,
+            )
+            for row in cursor
+        ]
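The size filter in the query relies purely on optimizer statistics. A quick worked example of the estimate it computes (the row count and average row length below are made-up numbers):

```python
# Same arithmetic as the SIZE_GB expression above, with illustrative statistics.
num_rows = 20_000_000   # t.NUM_ROWS from ALL_TABLES / DBA_TABLES
avg_row_len = 512       # t.AVG_ROW_LEN, in bytes

size_gb = (num_rows * avg_row_len) / (1024 * 1024 * 1024)
print(f"estimated size: {size_gb:.2f} GB")  # ~9.54 GB

# A table whose statistics are NULL contributes COALESCE(..., 0) = 0 GB,
# so it always passes the size filter and stays a profiling candidate.
```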
datahub/ingestion/source/sql/sql_common.py:

@@ -51,7 +51,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     SAMPLE_SIZE_MULTIPLIER,
     ClassificationHandler,
-    ClassificationReportMixin,
 )
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.common.subtypes import (
@@ -59,6 +58,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     downgrade_schema_from_v2,
@@ -74,7 +74,6 @@ from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
-    StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
@@ -118,9 +117,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.telemetry import telemetry
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
-from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
 )
@@ -134,43 +131,6 @@ if TYPE_CHECKING:
 logger: logging.Logger = logging.getLogger(__name__)


-@dataclass
-class SQLSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    tables_scanned: int = 0
-    views_scanned: int = 0
-    entities_profiled: int = 0
-    filtered: LossyList[str] = field(default_factory=LossyList)
-
-    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
-
-    num_view_definitions_parsed: int = 0
-    num_view_definitions_failed_parsing: int = 0
-    num_view_definitions_failed_column_parsing: int = 0
-    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
-
-    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
-        """
-        Entity could be a view or a table
-        """
-        if ent_type == "table":
-            self.tables_scanned += 1
-        elif ent_type == "view":
-            self.views_scanned += 1
-        else:
-            raise KeyError(f"Unknown entity {ent_type}.")
-
-    def report_entity_profiled(self, name: str) -> None:
-        self.entities_profiled += 1
-
-    def report_dropped(self, ent_name: str) -> None:
-        self.filtered.append(ent_name)
-
-    def report_from_query_combiner(
-        self, query_combiner_report: SQLAlchemyQueryCombinerReport
-    ) -> None:
-        self.query_combiner = query_combiner_report
-
-
 class SqlWorkUnit(MetadataWorkUnit):
     pass

@@ -352,7 +312,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

     def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str):
         super().__init__(config, ctx)
-        self.config = config
+        self.config: SQLCommonConfig = config
         self.platform = platform
         self.report: SQLSourceReport = SQLSourceReport()
         self.profile_metadata_info: ProfileMetadata = ProfileMetadata()
@@ -392,6 +352,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             platform_instance=self.config.platform_instance,
             env=self.config.env,
         )
+        self.discovered_datasets: Set[str] = set()
         self._view_definition_cache: MutableMapping[str, str]
         if self.config.use_file_backed_cache:
             self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +792,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         self._classify(dataset_name, schema, table, data_reader, schema_metadata)

         dataset_snapshot.aspects.append(schema_metadata)
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

         yield from self.add_table_to_schema_container(
@@ -1126,8 +1088,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns,
             canonical_schema=schema_fields,
         )
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         description, properties, _ = self.get_table_properties(inspector, schema, view)
         try:
             view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1153,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             domain_registry=self.domain_registry,
         )

+    def _save_schema_to_resolver(self):
+        return self.config.include_view_lineage or (
+            hasattr(self.config, "include_lineage") and self.config.include_lineage
+        )
+
     def _run_sql_parser(
         self, view_identifier: str, query: str, schema_resolver: SchemaResolver
     ) -> Optional[SqlParsingResult]:
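The `hasattr` guard keeps `_save_schema_to_resolver` safe for source configs that do not define `include_lineage` (in this diff it is only added to the SQL Server config). A behaviourally equivalent spelling, shown purely for illustration, would use `getattr` with a default:

```python
# Not part of the diff - an equivalent formulation of the same check.
def _save_schema_to_resolver(self) -> bool:
    return self.config.include_view_lineage or getattr(
        self.config, "include_lineage", False
    )
```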
datahub/ingestion/source/sql/sql_common.py (continued):

@@ -1274,17 +1242,22 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def is_dataset_eligible_for_profiling(
         self,
         dataset_name: str,
-        … (1 removed line; content not captured in this diff view)
+        schema: str,
         inspector: Inspector,
         profile_candidates: Optional[List[str]],
     ) -> bool:
-        … (7 removed lines; only fragments were captured in this diff view)
+        if not (
+            self.config.table_pattern.allowed(dataset_name)
+            and self.config.profile_pattern.allowed(dataset_name)
+        ):
+            self.report.profiling_skipped_table_profile_pattern[schema] += 1
+            return False
+
+        if profile_candidates is not None and dataset_name not in profile_candidates:
+            self.report.profiling_skipped_other[schema] += 1
+            return False
+
+        return True

     def loop_profiler_requests(
         self,
@@ -1299,7 +1272,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if (
             sql_config.profiling.profile_if_updated_since_days is not None
             or sql_config.profiling.profile_table_size_limit is not None
-            or sql_config.profiling.profile_table_row_limit is None
+            or sql_config.profiling.profile_table_row_limit is not None
         ):
             try:
                 threshold_time: Optional[datetime.datetime] = None
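The one-word fix above (`is None` becomes `is not None`) changes when profile candidates are generated. A standalone restatement of the corrected gating logic, with illustrative limit values rather than DataHub defaults:

```python
# Stand-alone sketch of the corrected condition; not code from the package.
def should_generate_profile_candidates(
    profile_if_updated_since_days, profile_table_size_limit, profile_table_row_limit
) -> bool:
    return (
        profile_if_updated_since_days is not None
        or profile_table_size_limit is not None
        or profile_table_row_limit is not None  # was `is None` before this release
    )

# Only a row limit configured: candidates are now generated.
print(should_generate_profile_candidates(None, None, 5_000_000))  # True
# Nothing configured: candidate generation is skipped.
print(should_generate_profile_candidates(None, None, None))       # False
```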
datahub/ingestion/source/sql/sql_common.py (continued):

@@ -1320,8 +1293,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     schema=schema, entity=table, inspector=inspector
                 )
                 if not self.is_dataset_eligible_for_profiling(
-                    dataset_name,
+                    dataset_name, schema, inspector, profile_candidates
                 ):
+                    self.report.num_tables_not_eligible_profiling[schema] += 1
                     if self.config.profiling.report_dropped_profiles:
                         self.report.report_dropped(f"profile of {dataset_name}")
                     continue
datahub/ingestion/source/sql/sql_generic_profiler.py:

@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Union, cast

@@ -14,42 +14,13 @@ from datahub.ingestion.source.ge_data_profiler import (
     DatahubGEProfiler,
     GEProfilerRequest,
 )
-from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
 from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType
-from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
-
-
-@dataclass
-class DetailedProfilerReportMixin:
-    profiling_skipped_not_updated: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-    profiling_skipped_size_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_row_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
-
-    num_tables_not_eligible_profiling: Dict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-
-class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport):
-    pass


 @dataclass
@@ -65,7 +36,7 @@ class GenericProfiler:
     def __init__(
         self,
         config: SQLCommonConfig,
-        report:
+        report: SQLSourceReport,
         platform: str,
         state_handler: Optional[ProfilingHandler] = None,
     ) -> None:
datahub/ingestion/source/sql/sql_report.py (new file):

@@ -0,0 +1,75 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class DetailedProfilerReportMixin:
+    profiling_skipped_not_updated: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    profiling_skipped_size_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_row_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
+    num_tables_not_eligible_profiling: Dict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+
+@dataclass
+class SQLSourceReport(
+    StaleEntityRemovalSourceReport,
+    ClassificationReportMixin,
+    DetailedProfilerReportMixin,
+):
+    tables_scanned: int = 0
+    views_scanned: int = 0
+    entities_profiled: int = 0
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
+    num_view_definitions_parsed: int = 0
+    num_view_definitions_failed_parsing: int = 0
+    num_view_definitions_failed_column_parsing: int = 0
+    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
+    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
+        """
+        Entity could be a view or a table
+        """
+        if ent_type == "table":
+            self.tables_scanned += 1
+        elif ent_type == "view":
+            self.views_scanned += 1
+        else:
+            raise KeyError(f"Unknown entity {ent_type}.")
+
+    def report_entity_profiled(self, name: str) -> None:
+        self.entities_profiled += 1
+
+    def report_dropped(self, ent_name: str) -> None:
+        self.filtered.append(ent_name)
+
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report