acryl-datahub 0.14.1.13rc5__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (27)
  1. {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2305 -2305
  2. {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +27 -26
  3. datahub/__init__.py +1 -1
  4. datahub/configuration/kafka_consumer_config.py +4 -1
  5. datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
  6. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
  7. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
  8. datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
  9. datahub/ingestion/source/ge_data_profiler.py +1 -1
  10. datahub/ingestion/source/ge_profiling_config.py +6 -2
  11. datahub/ingestion/source/redshift/report.py +2 -2
  12. datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
  13. datahub/ingestion/source/sql/oracle.py +50 -0
  14. datahub/ingestion/source/sql/sql_common.py +18 -52
  15. datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
  16. datahub/ingestion/source/sql/sql_report.py +75 -0
  17. datahub/ingestion/source/sql/teradata.py +2 -2
  18. datahub/ingestion/source/sql/vertica.py +2 -2
  19. datahub/ingestion/source/unity/report.py +2 -2
  20. datahub/metadata/schema.avsc +1 -1
  21. datahub/metadata/schemas/AssertionInfo.avsc +1 -1
  22. datahub/metadata/schemas/InputFields.avsc +1 -1
  23. datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
  24. datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
  25. {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
  26. {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
  27. {acryl_datahub-0.14.1.13rc5.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_generic_profiler.py
@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Union, cast
 
@@ -14,42 +14,13 @@ from datahub.ingestion.source.ge_data_profiler import (
     DatahubGEProfiler,
     GEProfilerRequest,
 )
-from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
 from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType
-from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
-
-
-@dataclass
-class DetailedProfilerReportMixin:
-    profiling_skipped_not_updated: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-    profiling_skipped_size_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_row_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
-
-    num_tables_not_eligible_profiling: Dict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-
-class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport):
-    pass
 
 
 @dataclass
@@ -65,7 +36,7 @@ class GenericProfiler:
     def __init__(
         self,
         config: SQLCommonConfig,
-        report: ProfilingSqlReport,
+        report: SQLSourceReport,
         platform: str,
         state_handler: Optional[ProfilingHandler] = None,
     ) -> None:
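
Net effect of these hunks: the report classes move out of this module. DetailedProfilerReportMixin and ProfilingSqlReport are deleted here, and SQLSourceReport is now imported from the new sql_report module instead of from sql_common. For downstream code, the import change looks like this (a sketch distilled from the hunks above, not a hunk itself):

# Before (rc5): report classes were split across two modules.
from datahub.ingestion.source.sql.sql_common import SQLSourceReport
from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport

# After (rc6): one consolidated class in a dedicated module; the profiling
# counters now live on SQLSourceReport via DetailedProfilerReportMixin.
from datahub.ingestion.source.sql.sql_report import SQLSourceReport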
datahub/ingestion/source/sql/sql_report.py (new file)
@@ -0,0 +1,75 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class DetailedProfilerReportMixin:
+    profiling_skipped_not_updated: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    profiling_skipped_size_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_row_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
+    num_tables_not_eligible_profiling: Dict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+
+@dataclass
+class SQLSourceReport(
+    StaleEntityRemovalSourceReport,
+    ClassificationReportMixin,
+    DetailedProfilerReportMixin,
+):
+    tables_scanned: int = 0
+    views_scanned: int = 0
+    entities_profiled: int = 0
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
+    num_view_definitions_parsed: int = 0
+    num_view_definitions_failed_parsing: int = 0
+    num_view_definitions_failed_column_parsing: int = 0
+    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
+    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
+        """
+        Entity could be a view or a table
+        """
+        if ent_type == "table":
+            self.tables_scanned += 1
+        elif ent_type == "view":
+            self.views_scanned += 1
+        else:
+            raise KeyError(f"Unknown entity {ent_type}.")
+
+    def report_entity_profiled(self, name: str) -> None:
+        self.entities_profiled += 1
+
+    def report_dropped(self, ent_name: str) -> None:
+        self.filtered.append(ent_name)
+
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report
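
For orientation, here is a minimal usage sketch of the new class. The table and view names are made up for illustration, and it assumes SQLSourceReport and its parent report classes are default-constructible, which the all-defaulted dataclass fields above suggest:

from datahub.ingestion.source.sql.sql_report import SQLSourceReport

report = SQLSourceReport()

# Tables and views are tallied separately; any other ent_type raises KeyError.
report.report_entity_scanned("public.orders", ent_type="table")
report.report_entity_scanned("public.orders_summary", ent_type="view")

# Filtered-out entities land in 'filtered', a LossyList that keeps only a
# bounded sample of names rather than growing without limit.
report.report_dropped("staging.tmp_scratch")

assert report.tables_scanned == 1
assert report.views_scanned == 1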
datahub/ingestion/source/sql/teradata.py
@@ -44,7 +44,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.two_tier_sql_source import (
     TwoTierSQLAlchemyConfig,
     TwoTierSQLAlchemySource,
@@ -330,7 +330,7 @@ def optimized_get_view_definition(
 
 
 @dataclass
-class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport):
+class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport):
     num_queries_parsed: int = 0
     num_view_ddl_parsed: int = 0
     num_table_parse_failures: int = 0
datahub/ingestion/source/sql/vertica.py
@@ -27,7 +27,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
-    SQLSourceReport,
     SqlWorkUnit,
     get_schema_metadata,
 )
@@ -35,6 +34,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     SQLCommonConfig,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import get_domain_wu
 from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
@@ -536,7 +536,7 @@ class VerticaSource(SQLAlchemySource):
             )
 
             if not self.is_dataset_eligible_for_profiling(
-                dataset_name, sql_config, inspector, profile_candidates
+                dataset_name, schema, inspector, profile_candidates
             ):
                 if self.config.profiling.report_dropped_profiles:
                     self.report.report_dropped(f"profile of {dataset_name}")
datahub/ingestion/source/unity/report.py
@@ -2,7 +2,7 @@ from dataclasses import dataclass, field
 from typing import Optional, Tuple
 
 from datahub.ingestion.api.report import EntityFilterReport, Report
-from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
@@ -19,7 +19,7 @@ class UnityCatalogUsagePerfReport(Report):
 
 
 @dataclass
-class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport):
+class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
     metastores: EntityFilterReport = EntityFilterReport.field(type="metastore")
     catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog")
     schemas: EntityFilterReport = EntityFilterReport.field(type="schema")
datahub/metadata/schema.avsc
@@ -6005,7 +6005,7 @@
   "fields": [
     {
       "Searchable": {
-        "boostScore": 5.0,
+        "boostScore": 1.0,
         "fieldName": "fieldPaths",
         "fieldType": "TEXT",
         "queryByDefault": "true"
datahub/metadata/schemas/AssertionInfo.avsc
@@ -1542,7 +1542,7 @@
   "fields": [
     {
       "Searchable": {
-        "boostScore": 5.0,
+        "boostScore": 1.0,
         "fieldName": "fieldPaths",
         "fieldType": "TEXT",
         "queryByDefault": "true"
datahub/metadata/schemas/InputFields.avsc
@@ -42,7 +42,7 @@
   "fields": [
     {
       "Searchable": {
-        "boostScore": 5.0,
+        "boostScore": 1.0,
         "fieldName": "fieldPaths",
         "fieldType": "TEXT",
         "queryByDefault": "true"
datahub/metadata/schemas/MetadataChangeEvent.avsc
@@ -4013,7 +4013,7 @@
   "fields": [
     {
       "Searchable": {
-        "boostScore": 5.0,
+        "boostScore": 1.0,
         "fieldName": "fieldPaths",
         "fieldType": "TEXT",
         "queryByDefault": "true"
datahub/metadata/schemas/SchemaMetadata.avsc
@@ -309,7 +309,7 @@
   "fields": [
     {
       "Searchable": {
-        "boostScore": 5.0,
+        "boostScore": 1.0,
         "fieldName": "fieldPaths",
         "fieldType": "TEXT",
         "queryByDefault": "true"