acryl-datahub 0.14.1.13rc5__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2305 -2305
- {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +27 -26
- datahub/__init__.py +1 -1
- datahub/configuration/kafka_consumer_config.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
- datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
- datahub/ingestion/source/ge_data_profiler.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +6 -2
- datahub/ingestion/source/redshift/report.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
- datahub/ingestion/source/sql/oracle.py +50 -0
- datahub/ingestion/source/sql/sql_common.py +18 -52
- datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
- datahub/ingestion/source/sql/sql_report.py +75 -0
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/sql/vertica.py +2 -2
- datahub/ingestion/source/unity/report.py +2 -2
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/AssertionInfo.avsc +1 -1
- datahub/metadata/schemas/InputFields.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
- {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_generic_profiler.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Union, cast
 
```
```diff
@@ -14,42 +14,13 @@ from datahub.ingestion.source.ge_data_profiler import (
     DatahubGEProfiler,
     GEProfilerRequest,
 )
-from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
 from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType
-from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
-
-
-@dataclass
-class DetailedProfilerReportMixin:
-    profiling_skipped_not_updated: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-    profiling_skipped_size_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_row_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
-
-    num_tables_not_eligible_profiling: Dict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-
-class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport):
-    pass
 
 
 @dataclass
```
```diff
@@ -65,7 +36,7 @@ class GenericProfiler:
     def __init__(
         self,
         config: SQLCommonConfig,
-        report: ProfilingSqlReport,
+        report: SQLSourceReport,
         platform: str,
         state_handler: Optional[ProfilingHandler] = None,
     ) -> None:
```
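The skip counters that moved out of this module (and reappear in `sql_report.py` below) are built on `int_top_k_dict`, so profiler call sites can bump a per-table counter without initializing the key first. A minimal usage sketch, assuming acryl-datahub at this version is installed; the table names are made up:

```python
from datahub.ingestion.source.sql.sql_report import SQLSourceReport

report = SQLSourceReport()

# int_top_k_dict() yields a TopKDict that defaults missing keys to 0 and
# keeps only the largest entries when the report is rendered, so per-table
# skip counters stay bounded even on very large warehouses.
report.profiling_skipped_size_limit["analytics.events"] += 1
report.profiling_skipped_not_updated["analytics.stale_orders"] += 1
```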
datahub/ingestion/source/sql/sql_report.py
ADDED

```diff
@@ -0,0 +1,75 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class DetailedProfilerReportMixin:
+    profiling_skipped_not_updated: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    profiling_skipped_size_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_row_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
+    num_tables_not_eligible_profiling: Dict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+
+@dataclass
+class SQLSourceReport(
+    StaleEntityRemovalSourceReport,
+    ClassificationReportMixin,
+    DetailedProfilerReportMixin,
+):
+    tables_scanned: int = 0
+    views_scanned: int = 0
+    entities_profiled: int = 0
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
+    num_view_definitions_parsed: int = 0
+    num_view_definitions_failed_parsing: int = 0
+    num_view_definitions_failed_column_parsing: int = 0
+    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
+    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
+        """
+        Entity could be a view or a table
+        """
+        if ent_type == "table":
+            self.tables_scanned += 1
+        elif ent_type == "view":
+            self.views_scanned += 1
+        else:
+            raise KeyError(f"Unknown entity {ent_type}.")
+
+    def report_entity_profiled(self, name: str) -> None:
+        self.entities_profiled += 1
+
+    def report_dropped(self, ent_name: str) -> None:
+        self.filtered.append(ent_name)
+
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report
```
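The new module preserves the counter API that SQL sources already call, so only the import path changes for them. A short sketch of that API in use, again with made-up dataset names:

```python
from datahub.ingestion.source.sql.sql_report import SQLSourceReport

report = SQLSourceReport()

# "table" and "view" are the only entity types report_entity_scanned
# accepts; anything else raises KeyError.
report.report_entity_scanned("db.schema.orders", ent_type="table")
report.report_entity_scanned("db.schema.orders_view", ent_type="view")

# Filtered-out datasets land in a LossyList, which caps memory by keeping
# a bounded sample of entries rather than growing without limit.
report.report_dropped("db.schema.tmp_scratch")

assert report.tables_scanned == 1 and report.views_scanned == 1
```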
datahub/ingestion/source/sql/teradata.py
CHANGED

```diff
@@ -44,7 +44,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
```
```diff
@@ -330,7 +330,7 @@ def optimized_get_view_definition(
 
 
 @dataclass
-class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport):
+class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
     num_queries_parsed: int = 0
     num_view_ddl_parsed: int = 0
     num_table_parse_failures: int = 0
```
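`TeradataReport` illustrates the intended extension pattern: subclass the shared report and add only source-specific fields. A hypothetical source report following the same pattern (the class and counter below are illustrative, not part of the package):

```python
from dataclasses import dataclass

from datahub.ingestion.source.sql.sql_report import SQLSourceReport


@dataclass
class MySourceReport(SQLSourceReport):
    # Source-specific counter; the scan, profiling, and view-parsing
    # counters are all inherited from SQLSourceReport.
    num_custom_queries_parsed: int = 0


report = MySourceReport()
report.report_entity_scanned("mydb.myschema.t1")
report.num_custom_queries_parsed += 1
assert isinstance(report, SQLSourceReport)
```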
datahub/ingestion/source/sql/vertica.py
CHANGED

```diff
@@ -27,7 +27,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SQLSourceReport,
     SqlWorkUnit,
     get_schema_metadata,
 )
```
```diff
@@ -35,6 +34,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import get_domain_wu
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
```
```diff
@@ -536,7 +536,7 @@ class VerticaSource(SQLAlchemySource):
         )
 
         if not self.is_dataset_eligible_for_profiling(
-            dataset_name,
+            dataset_name, schema, inspector, profile_candidates
         ):
             if self.config.profiling.report_dropped_profiles:
                 self.report.report_dropped(f"profile of {dataset_name}")
```
datahub/ingestion/source/unity/report.py
CHANGED

```diff
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from typing import Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
```
```diff
@@ -19,7 +19,7 @@ class UnityCatalogUsagePerfReport(Report):
 
 
 @dataclass
-class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport):
+class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     metastores: EntityFilterReport = EntityFilterReport.field(type="metastore")
     catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog")
     schemas: EntityFilterReport = EntityFilterReport.field(type="schema")
```
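`UnityCatalogReport`, like the other reports in this diff, is composed by multiply inheriting dataclasses. A self-contained sketch of why that works here (toy class names): Python collects dataclass fields in reverse MRO order, and a field with a default may not be followed by one without, so every field in these shared report bases carries a default or a default_factory.

```python
from dataclasses import dataclass, field


@dataclass
class StageReport:
    current_stage: str = "start"


@dataclass
class CountersReport:
    scanned: dict = field(default_factory=dict)


# Field order is CountersReport's fields first, then StageReport's; since
# all of them have defaults, the combined dataclass builds cleanly.
@dataclass
class CombinedReport(StageReport, CountersReport):
    pass


r = CombinedReport()
print(r.current_stage, r.scanned)  # start {}
```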
datahub/metadata/schema.avsc
CHANGED

File without changes

{acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt
RENAMED

File without changes