acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Note: this release of acryl-datahub has been flagged as potentially problematic.
Files changed (34)
  1. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2557 -2557
  2. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +34 -30
  3. datahub/__init__.py +1 -1
  4. datahub/configuration/kafka_consumer_config.py +4 -1
  5. datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
  6. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
  8. datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
  9. datahub/ingestion/source/ge_data_profiler.py +1 -1
  10. datahub/ingestion/source/ge_profiling_config.py +6 -2
  11. datahub/ingestion/source/redshift/report.py +2 -2
  12. datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
  13. datahub/ingestion/source/sql/mssql/job_models.py +1 -0
  14. datahub/ingestion/source/sql/mssql/source.py +113 -38
  15. datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
  16. datahub/ingestion/source/sql/oracle.py +50 -0
  17. datahub/ingestion/source/sql/sql_common.py +28 -54
  18. datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
  19. datahub/ingestion/source/sql/sql_report.py +75 -0
  20. datahub/ingestion/source/sql/teradata.py +2 -2
  21. datahub/ingestion/source/sql/vertica.py +2 -2
  22. datahub/ingestion/source/unity/report.py +2 -2
  23. datahub/metadata/schema.avsc +1 -1
  24. datahub/metadata/schemas/AssertionInfo.avsc +1 -1
  25. datahub/metadata/schemas/InputFields.avsc +1 -1
  26. datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
  27. datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
  28. datahub/sql_parsing/datajob.py +50 -0
  29. datahub/sql_parsing/query_types.py +10 -1
  30. datahub/sql_parsing/split_statements.py +163 -0
  31. datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
  32. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
  33. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
  34. {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/mssql/source.py

@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
     ProcedureParameter,
     StoredProcedure,
 )
+from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+    generate_procedure_lineage,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.file_backed_collections import FileBackedList

 logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable to convert the SQL Server assets urns to lowercase",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description="Enable lineage extraction for stored procedures",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
         self.current_database = None
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
+        self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
-        for step_id, step_data in job_steps.items():
+        for _step_id, step_data in job_steps.items():
             step = JobStep(
                 job_name=job.formatted_name,
                 step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
         if procedures:
             yield from self.construct_flow_workunits(data_flow=data_flow)
         for procedure in procedures:
-            upstream = self._get_procedure_upstream(conn, procedure)
-            downstream = self._get_procedure_downstream(conn, procedure)
-            data_job = MSSQLDataJob(
-                entity=procedure,
-            )
-            # TODO: because of this upstream and downstream are more dependencies,
-            # can't be used as DataJobInputOutput.
-            # Should be reorganized into lineage.
-            data_job.add_property("procedure_depends_on", str(upstream.as_property))
-            data_job.add_property(
-                "depending_on_procedure", str(downstream.as_property)
-            )
-            procedure_definition, procedure_code = self._get_procedure_code(
-                conn, procedure
-            )
-            if procedure_definition:
-                data_job.add_property("definition", procedure_definition)
-            if sql_config.include_stored_procedures_code and procedure_code:
-                data_job.add_property("code", procedure_code)
-            procedure_inputs = self._get_procedure_inputs(conn, procedure)
-            properties = self._get_procedure_properties(conn, procedure)
-            data_job.add_property(
-                "input parameters", str([param.name for param in procedure_inputs])
-            )
-            for param in procedure_inputs:
-                data_job.add_property(
-                    f"parameter {param.name}", str(param.properties)
-                )
-            for property_name, property_value in properties.items():
-                data_job.add_property(property_name, str(property_value))
-            yield from self.construct_job_workunits(data_job)
+            yield from self._process_stored_procedure(conn, procedure)
+
+    def _process_stored_procedure(
+        self, conn: Connection, procedure: StoredProcedure
+    ) -> Iterable[MetadataWorkUnit]:
+        upstream = self._get_procedure_upstream(conn, procedure)
+        downstream = self._get_procedure_downstream(conn, procedure)
+        data_job = MSSQLDataJob(
+            entity=procedure,
+        )
+        # TODO: because of this upstream and downstream are more dependencies,
+        # can't be used as DataJobInputOutput.
+        # Should be reorganized into lineage.
+        data_job.add_property("procedure_depends_on", str(upstream.as_property))
+        data_job.add_property("depending_on_procedure", str(downstream.as_property))
+        procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+        procedure.code = procedure_code
+        if procedure_definition:
+            data_job.add_property("definition", procedure_definition)
+        if procedure_code and self.config.include_stored_procedures_code:
+            data_job.add_property("code", procedure_code)
+        procedure_inputs = self._get_procedure_inputs(conn, procedure)
+        properties = self._get_procedure_properties(conn, procedure)
+        data_job.add_property(
+            "input parameters", str([param.name for param in procedure_inputs])
+        )
+        for param in procedure_inputs:
+            data_job.add_property(f"parameter {param.name}", str(param.properties))
+        for property_name, property_value in properties.items():
+            data_job.add_property(property_name, str(property_value))
+        if self.config.include_lineage:
+            # These will be used to construct lineage
+            self.stored_procedures.append(procedure)
+        yield from self.construct_job_workunits(
+            data_job,
+            # For stored procedure lineage is ingested later
+            include_lineage=False,
+        )

     @staticmethod
     def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                 code_list.append(row["Text"])
                 if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                     code_slice_index = index
-            definition = "\n".join(code_list[:code_slice_index])
-            code = "\n".join(code_list[code_slice_index:])
+            definition = "".join(code_list[:code_slice_index])
+            code = "".join(code_list[code_slice_index:])
         except ResourceClosedError:
             logger.warning(
                 "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
     def construct_job_workunits(
         self,
         data_job: MSSQLDataJob,
+        include_lineage: bool = True,
     ) -> Iterable[MetadataWorkUnit]:
         yield MetadataChangeProposalWrapper(
             entityUrn=data_job.urn,
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()

-        yield MetadataChangeProposalWrapper(
-            entityUrn=data_job.urn,
-            aspect=data_job.as_datajob_input_output_aspect,
-        ).as_workunit()
+        if include_lineage:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_datajob_input_output_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear

     def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else qualified_table_name
         )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        yield from super().get_workunits_internal()
+
+        # This is done at the end so that we will have access to tables
+        # from all databases in schema_resolver and discovered_tables
+        for procedure in self.stored_procedures:
+            with self.report.report_exc(
+                message="Failed to parse stored procedure lineage",
+                context=procedure.full_name,
+                level=StructuredLogLevel.WARN,
+            ):
+                yield from auto_workunit(
+                    generate_procedure_lineage(
+                        schema_resolver=self.schema_resolver,
+                        procedure=procedure,
+                        procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                        is_temp_table=self.is_temp_table,
+                    )
+                )
+
+    def is_temp_table(self, name: str) -> bool:
+        try:
+            parts = name.split(".")
+            table_name = parts[-1]
+            schema_name = parts[-2]
+            db_name = parts[-3]
+
+            if table_name.startswith("#"):
+                return True
+
+            # This is also a temp table if
+            # 1. this name would be allowed by the dataset patterns, and
+            # 2. we have a list of discovered tables, and
+            # 3. it's not in the discovered tables list
+            if (
+                self.config.database_pattern.allowed(db_name)
+                and self.config.schema_pattern.allowed(schema_name)
+                and self.config.table_pattern.allowed(name)
+                and self.standardize_identifier_case(name)
+                not in self.discovered_datasets
+            ):
+                logger.debug(f"inferred as temp table {name}")
+                return True
+
+        except Exception:
+            logger.warning(f"Error parsing table name {name} ")
+        return False
+
+    def standardize_identifier_case(self, table_ref_str: str) -> str:
+        return (
+            table_ref_str.lower()
+            if self.config.convert_urns_to_lowercase
+            else table_ref_str
+        )
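For reference, the new include_lineage flag on SQLServerConfig (default True) is what gates the deferred stored-procedure lineage pass added above. Below is a minimal sketch of a programmatic ingestion run that disables it; the host, credentials, database name, and console sink are illustrative placeholders, not values taken from this diff.

# Hedged sketch: only `include_lineage` comes from the diff above; the
# connection details and sink are hypothetical placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "localhost:1433",      # placeholder
                "username": "datahub",              # placeholder
                "password": "example",              # placeholder
                "database": "AdventureWorks",       # placeholder
                "include_lineage": False,           # skip the stored-procedure lineage pass
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()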
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py (new file)

@@ -0,0 +1,84 @@
+import logging
+from typing import Callable, Iterable, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+from datahub.sql_parsing.datajob import to_datajob_input_output
+from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.split_statements import split_statements
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    ObservedQuery,
+    SqlParsingAggregator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_procedure_code(
+    *,
+    schema_resolver: SchemaResolver,
+    default_db: Optional[str],
+    default_schema: Optional[str],
+    code: str,
+    is_temp_table: Callable[[str], bool],
+    raise_: bool = False,
+) -> Optional[DataJobInputOutputClass]:
+    aggregator = SqlParsingAggregator(
+        platform=schema_resolver.platform,
+        env=schema_resolver.env,
+        schema_resolver=schema_resolver,
+        generate_lineage=True,
+        generate_queries=False,
+        generate_usage_statistics=False,
+        generate_operations=False,
+        generate_query_subject_fields=False,
+        generate_query_usage_statistics=False,
+        is_temp_table=is_temp_table,
+    )
+    for query in split_statements(code):
+        # TODO: We should take into account `USE x` statements.
+        aggregator.add_observed_query(
+            observed=ObservedQuery(
+                default_db=default_db,
+                default_schema=default_schema,
+                query=query,
+            )
+        )
+    if aggregator.report.num_observed_queries_failed and raise_:
+        logger.info(aggregator.report.as_string())
+        raise ValueError(
+            f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+        )
+
+    mcps = list(aggregator.gen_metadata())
+    return to_datajob_input_output(
+        mcps=mcps,
+        ignore_extra_mcps=True,
+    )
+
+
+# Is procedure handling generic enough to be added to SqlParsingAggregator?
+def generate_procedure_lineage(
+    *,
+    schema_resolver: SchemaResolver,
+    procedure: StoredProcedure,
+    procedure_job_urn: str,
+    is_temp_table: Callable[[str], bool] = lambda _: False,
+    raise_: bool = False,
+) -> Iterable[MetadataChangeProposalWrapper]:
+    if procedure.code:
+        datajob_input_output = parse_procedure_code(
+            schema_resolver=schema_resolver,
+            default_db=procedure.db,
+            default_schema=procedure.schema,
+            code=procedure.code,
+            is_temp_table=is_temp_table,
+            raise_=raise_,
+        )
+
+        if datajob_input_output:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=procedure_job_urn,
+                aspect=datajob_input_output,
+            )
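To make the new module's flow concrete, here is a rough sketch of calling parse_procedure_code on its own, assuming a SchemaResolver can be constructed standalone with just a platform and env; the SQL text, database and schema names, and the temp-table heuristic are made-up illustrations, not part of the package.

# Hedged standalone use of parse_procedure_code; the SQL body and
# AdventureWorks/dbo names are hypothetical examples.
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
    parse_procedure_code,
)
from datahub.sql_parsing.schema_resolver import SchemaResolver

resolver = SchemaResolver(platform="mssql", env="PROD")
io_aspect = parse_procedure_code(
    schema_resolver=resolver,
    default_db="AdventureWorks",   # hypothetical database
    default_schema="dbo",
    code="INSERT INTO dbo.SalesSummary SELECT * FROM dbo.Sales;",
    is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
)
if io_aspect:
    # DataJobInputOutputClass carries the resolved upstream/downstream datasets.
    print(io_aspect.inputDatasets, io_aspect.outputDatasets)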
datahub/ingestion/source/sql/oracle.py

@@ -1,3 +1,4 @@
+import datetime
 import logging
 import re

@@ -631,3 +632,52 @@ class OracleSource(SQLAlchemySource):
             clear=False,
         ):
             return super().get_workunits()
+
+    def generate_profile_candidates(
+        self,
+        inspector: Inspector,
+        threshold_time: Optional[datetime.datetime],
+        schema: str,
+    ) -> Optional[List[str]]:
+        tables_table_name = (
+            "ALL_TABLES" if self.config.data_dictionary_mode == "ALL" else "DBA_TABLES"
+        )
+
+        # If stats are available , they are used even if they are stale.
+        # Assuming that the table would typically grow over time, this will ensure to filter
+        # large tables known at stats collection time from profiling candidates.
+        # If stats are not available (NULL), such tables are not filtered and are considered
+        # as profiling candidates.
+        cursor = inspector.bind.execute(
+            sql.text(
+                f"""SELECT
+                    t.OWNER,
+                    t.TABLE_NAME,
+                    t.NUM_ROWS,
+                    t.LAST_ANALYZED,
+                    COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) AS SIZE_GB
+                FROM {tables_table_name} t
+                WHERE t.OWNER = :owner
+                    AND (t.NUM_ROWS < :table_row_limit OR t.NUM_ROWS IS NULL)
+                    AND COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) < :table_size_limit
+                """
+            ),
+            dict(
+                owner=inspector.dialect.denormalize_name(schema),
+                table_row_limit=self.config.profiling.profile_table_row_limit,
+                table_size_limit=self.config.profiling.profile_table_size_limit,
+            ),
+        )
+
+        TABLE_NAME_COL_LOC = 1
+        return [
+            self.get_identifier(
+                schema=schema,
+                entity=inspector.dialect.normalize_name(row[TABLE_NAME_COL_LOC])
+                or _raise_err(
+                    ValueError(f"Invalid table name: {row[TABLE_NAME_COL_LOC]}")
+                ),
+                inspector=inspector,
+            )
+            for row in cursor
+        ]
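As a quick sanity check on the size filter above: NUM_ROWS * AVG_ROW_LEN approximates the table size in bytes, so dividing by 1024^3 yields gigabytes, which is then compared against profile_table_size_limit. A small worked example with made-up numbers:

# Illustrative arithmetic only; the row count and average row length are invented.
num_rows = 10_000_000
avg_row_len_bytes = 200
size_gb = (num_rows * avg_row_len_bytes) / (1024 * 1024 * 1024)
print(f"{size_gb:.2f} GB")  # ~1.86 GB; a profiling candidate only if under the limit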
datahub/ingestion/source/sql/sql_common.py

@@ -51,7 +51,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     SAMPLE_SIZE_MULTIPLIER,
     ClassificationHandler,
-    ClassificationReportMixin,
 )
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.common.subtypes import (
@@ -59,6 +58,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     downgrade_schema_from_v2,
@@ -74,7 +74,6 @@ from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
-    StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
@@ -118,9 +117,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.telemetry import telemetry
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
-from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
 )
@@ -134,43 +131,6 @@ if TYPE_CHECKING:
 logger: logging.Logger = logging.getLogger(__name__)


-@dataclass
-class SQLSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    tables_scanned: int = 0
-    views_scanned: int = 0
-    entities_profiled: int = 0
-    filtered: LossyList[str] = field(default_factory=LossyList)
-
-    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
-
-    num_view_definitions_parsed: int = 0
-    num_view_definitions_failed_parsing: int = 0
-    num_view_definitions_failed_column_parsing: int = 0
-    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
-
-    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
-        """
-        Entity could be a view or a table
-        """
-        if ent_type == "table":
-            self.tables_scanned += 1
-        elif ent_type == "view":
-            self.views_scanned += 1
-        else:
-            raise KeyError(f"Unknown entity {ent_type}.")
-
-    def report_entity_profiled(self, name: str) -> None:
-        self.entities_profiled += 1
-
-    def report_dropped(self, ent_name: str) -> None:
-        self.filtered.append(ent_name)
-
-    def report_from_query_combiner(
-        self, query_combiner_report: SQLAlchemyQueryCombinerReport
-    ) -> None:
-        self.query_combiner = query_combiner_report
-
-
 class SqlWorkUnit(MetadataWorkUnit):
     pass

@@ -352,7 +312,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

     def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str):
         super().__init__(config, ctx)
-        self.config = config
+        self.config: SQLCommonConfig = config
         self.platform = platform
         self.report: SQLSourceReport = SQLSourceReport()
         self.profile_metadata_info: ProfileMetadata = ProfileMetadata()
@@ -392,6 +352,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             platform_instance=self.config.platform_instance,
             env=self.config.env,
         )
+        self.discovered_datasets: Set[str] = set()
         self._view_definition_cache: MutableMapping[str, str]
         if self.config.use_file_backed_cache:
             self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +792,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             self._classify(dataset_name, schema, table, data_reader, schema_metadata)

         dataset_snapshot.aspects.append(schema_metadata)
-        if self.config.include_view_lineage:
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

         yield from self.add_table_to_schema_container(
@@ -1126,8 +1088,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns,
             canonical_schema=schema_fields,
         )
-        if self.config.include_view_lineage:
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         description, properties, _ = self.get_table_properties(inspector, schema, view)
         try:
             view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1153,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             domain_registry=self.domain_registry,
         )

+    def _save_schema_to_resolver(self):
+        return self.config.include_view_lineage or (
+            hasattr(self.config, "include_lineage") and self.config.include_lineage
+        )
+
     def _run_sql_parser(
         self, view_identifier: str, query: str, schema_resolver: SchemaResolver
     ) -> Optional[SqlParsingResult]:
@@ -1274,17 +1242,22 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def is_dataset_eligible_for_profiling(
         self,
         dataset_name: str,
-        sql_config: SQLCommonConfig,
+        schema: str,
         inspector: Inspector,
         profile_candidates: Optional[List[str]],
     ) -> bool:
-        return (
-            sql_config.table_pattern.allowed(dataset_name)
-            and sql_config.profile_pattern.allowed(dataset_name)
-        ) and (
-            profile_candidates is None
-            or (profile_candidates is not None and dataset_name in profile_candidates)
-        )
+        if not (
+            self.config.table_pattern.allowed(dataset_name)
+            and self.config.profile_pattern.allowed(dataset_name)
+        ):
+            self.report.profiling_skipped_table_profile_pattern[schema] += 1
+            return False
+
+        if profile_candidates is not None and dataset_name not in profile_candidates:
+            self.report.profiling_skipped_other[schema] += 1
+            return False
+
+        return True

     def loop_profiler_requests(
         self,
@@ -1299,7 +1272,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if (
             sql_config.profiling.profile_if_updated_since_days is not None
             or sql_config.profiling.profile_table_size_limit is not None
-            or sql_config.profiling.profile_table_row_limit is None
+            or sql_config.profiling.profile_table_row_limit is not None
         ):
             try:
                 threshold_time: Optional[datetime.datetime] = None
@@ -1320,8 +1293,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 schema=schema, entity=table, inspector=inspector
             )
             if not self.is_dataset_eligible_for_profiling(
-                dataset_name, sql_config, inspector, profile_candidates
+                dataset_name, schema, inspector, profile_candidates
            ):
+                self.report.num_tables_not_eligible_profiling[schema] += 1
                 if self.config.profiling.report_dropped_profiles:
                     self.report.report_dropped(f"profile of {dataset_name}")
                 continue
datahub/ingestion/source/sql/sql_generic_profiler.py

@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Union, cast

@@ -14,42 +14,13 @@ from datahub.ingestion.source.ge_data_profiler import (
     DatahubGEProfiler,
     GEProfilerRequest,
 )
-from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
 from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType
-from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
-
-
-@dataclass
-class DetailedProfilerReportMixin:
-    profiling_skipped_not_updated: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-    profiling_skipped_size_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_row_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
-
-    num_tables_not_eligible_profiling: Dict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-
-class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport):
-    pass


 @dataclass
@@ -65,7 +36,7 @@ class GenericProfiler:
     def __init__(
         self,
         config: SQLCommonConfig,
-        report: ProfilingSqlReport,
+        report: SQLSourceReport,
         platform: str,
         state_handler: Optional[ProfilingHandler] = None,
     ) -> None:
datahub/ingestion/source/sql/sql_report.py (new file)

@@ -0,0 +1,75 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class DetailedProfilerReportMixin:
+    profiling_skipped_not_updated: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    profiling_skipped_size_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_row_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
+    num_tables_not_eligible_profiling: Dict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+
+@dataclass
+class SQLSourceReport(
+    StaleEntityRemovalSourceReport,
+    ClassificationReportMixin,
+    DetailedProfilerReportMixin,
+):
+    tables_scanned: int = 0
+    views_scanned: int = 0
+    entities_profiled: int = 0
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
+    num_view_definitions_parsed: int = 0
+    num_view_definitions_failed_parsing: int = 0
+    num_view_definitions_failed_column_parsing: int = 0
+    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
+    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
+        """
+        Entity could be a view or a table
+        """
+        if ent_type == "table":
+            self.tables_scanned += 1
+        elif ent_type == "view":
+            self.views_scanned += 1
+        else:
+            raise KeyError(f"Unknown entity {ent_type}.")
+
+    def report_entity_profiled(self, name: str) -> None:
+        self.entities_profiled += 1
+
+    def report_dropped(self, ent_name: str) -> None:
+        self.filtered.append(ent_name)
+
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report