acryl-datahub 0.15.0.5rc9__py3-none-any.whl → 0.15.0.6rc1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/METADATA +2431 -2431
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/RECORD +46 -45
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +2 -1
- datahub/ingestion/graph/entity_versioning.py +201 -0
- datahub/ingestion/source/abs/report.py +2 -2
- datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/delta_lake/report.py +2 -2
- datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
- datahub/ingestion/source/elastic_search.py +2 -1
- datahub/ingestion/source/ge_profiling_config.py +11 -7
- datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
- datahub/ingestion/source/identity/okta.py +2 -1
- datahub/ingestion/source/kafka/kafka.py +2 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -1
- datahub/ingestion/source/ldap.py +2 -1
- datahub/ingestion/source/looker/lookml_config.py +9 -5
- datahub/ingestion/source/mongodb.py +2 -1
- datahub/ingestion/source/nifi.py +2 -1
- datahub/ingestion/source/powerbi/config.py +3 -2
- datahub/ingestion/source/powerbi/powerbi.py +28 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +11 -36
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +17 -4
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
- datahub/ingestion/source/redash.py +5 -5
- datahub/ingestion/source/salesforce.py +4 -1
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +45 -10
- datahub/ingestion/source/snowflake/snowflake_query.py +20 -1
- datahub/ingestion/source/snowflake/snowflake_report.py +8 -1
- datahub/ingestion/source/snowflake/snowflake_schema.py +98 -4
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +294 -62
- datahub/ingestion/source/snowflake/snowflake_utils.py +17 -8
- datahub/ingestion/source/snowflake/snowflake_v2.py +15 -3
- datahub/ingestion/source/tableau/tableau.py +2 -1
- datahub/ingestion/source/unity/ge_profiler.py +55 -4
- datahub/ingestion/source/unity/report.py +1 -0
- datahub/ingestion/source_report/pulsar.py +5 -4
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.5rc9.dist-info → acryl_datahub-0.15.0.6rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_profiling_config.py CHANGED

@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )

     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days.
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )

     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `
-        "Supported for `
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )

     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count.
-        "no limit on the row count of tables to profile. Supported only in
-        "Supported for `
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )

     profile_table_row_count_estimate_only: bool = Field(
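The substance of this hunk is the trailing space added before each continued string literal: Python concatenates adjacent literals at compile time, so without the space the rendered descriptions ran words together. A minimal standalone sketch of the failure mode the change fixes (not code from the package):

broken = (
    "A positive integer that specifies the maximum number of columns to profile for"
    "any table."
)
fixed = (
    "A positive integer that specifies the maximum number of columns to profile for "
    "any table."
)
assert "forany" in broken   # adjacent literals fuse the words
assert "for any" in fixed   # the trailing space keeps the sentence readable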
datahub/ingestion/source/iceberg/iceberg_common.py CHANGED

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict,
+from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
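The same change recurs throughout the rest of this release: report fields such as `filtered` gain an explicit `LossyList[str]` annotation with a `default_factory`. A minimal sketch of the pattern, assuming `LossyList` behaves as a size-bounded list suitable for ingestion reports:

from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossyList


@dataclass
class ExampleSourceReport:
    # Mutable defaults must go through default_factory; typing the field as
    # LossyList[str] keeps the report bounded even if many names are filtered.
    filtered: LossyList[str] = field(default_factory=LossyList)

    def report_dropped(self, name: str) -> None:
        self.filtered.append(name)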
datahub/ingestion/source/identity/okta.py CHANGED

@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):

 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/kafka/kafka.py CHANGED

@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(

 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1
datahub/ingestion/source/kafka_connect/common.py CHANGED

@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(

 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1
datahub/ingestion/source/ldap.py CHANGED

@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns:
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)

     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)
datahub/ingestion/source/looker/lookml_config.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
 from datetime import timedelta
-from typing import Any, Dict,
+from typing import Any, Dict, Literal, Optional, Union

 import pydantic
 from pydantic import root_validator, validator
@@ -48,13 +48,17 @@ DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}"
 class LookMLSourceReport(StaleEntityRemovalSourceReport):
     git_clone_latency: Optional[timedelta] = None
     models_discovered: int = 0
-    models_dropped:
+    models_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
     views_discovered: int = 0
-    views_dropped:
-    views_dropped_unreachable:
+    views_dropped: LossyList[str] = dataclass_field(default_factory=LossyList)
+    views_dropped_unreachable: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     query_parse_attempts: int = 0
     query_parse_failures: int = 0
-    query_parse_failure_views:
+    query_parse_failure_views: LossyList[str] = dataclass_field(
+        default_factory=LossyList
+    )
     _looker_api: Optional[LookerAPI] = None

     def report_models_scanned(self) -> None:
datahub/ingestion/source/mongodb.py CHANGED

@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     UnionTypeClass,
 )
 from datahub.metadata.urns import DatasetUrn
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -143,7 +144,7 @@ class MongoDBConfig(

 @dataclass
 class MongoDBSourceReport(StaleEntityRemovalSourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)
datahub/ingestion/source/nifi.py CHANGED

@@ -46,6 +46,7 @@ from datahub.metadata.schema_classes import (
     DatasetPropertiesClass,
 )
 from datahub.specific.datajob import DataJobPatchBuilder
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 NIFI = "nifi"
@@ -452,7 +453,7 @@ def get_attribute_value(attr_lst: List[dict], attr_name: str) -> Optional[str]:

 @dataclass
 class NifiSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/powerbi/config.py CHANGED

@@ -132,6 +132,7 @@ class Constant:
     ACTIVE = "Active"
     SQL_PARSING_FAILURE = "SQL Parsing Failure"
     M_QUERY_NULL = '"null"'
+    REPORT_WEB_URL = "reportWebUrl"


 @dataclass
@@ -195,8 +196,8 @@ class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):

     dashboards_scanned: int = 0
     charts_scanned: int = 0
-    filtered_dashboards:
-    filtered_charts:
+    filtered_dashboards: LossyList[str] = dataclass_field(default_factory=LossyList)
+    filtered_charts: LossyList[str] = dataclass_field(default_factory=LossyList)

     m_query_parse_timer: PerfTimer = dataclass_field(default_factory=PerfTimer)
     m_query_parse_attempts: int = 0
datahub/ingestion/source/powerbi/powerbi.py CHANGED

@@ -582,8 +582,11 @@ class Mapper:
         if tile.dataset is not None and tile.dataset.webUrl is not None:
             custom_properties[Constant.DATASET_WEB_URL] = tile.dataset.webUrl

-        if tile.
-            custom_properties[Constant.REPORT_ID] = tile.
+        if tile.report_id is not None:
+            custom_properties[Constant.REPORT_ID] = tile.report_id
+
+        if tile.report is not None and tile.report.webUrl is not None:
+            custom_properties[Constant.REPORT_WEB_URL] = tile.report.webUrl

         return custom_properties

@@ -1053,6 +1056,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1074,6 +1078,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -1167,8 +1172,28 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

+        # find all dashboards with a Tile referencing this report
+        downstream_dashboards_edges = []
+        for d in workspace.dashboards.values():
+            if any(t.report_id == report.id for t in d.tiles):
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=d.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                    sourceUrn=None,
+                    created=None,
+                    lastModified=None,
+                    properties=None,
+                )
+                downstream_dashboards_edges.append(edge)
+
         # Let's convert report to datahub dashboard
-        report_mcps = self.report_to_dashboard(
+        report_mcps = self.report_to_dashboard(
+            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+        )

         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
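The new block records, for each report, the dashboards whose tiles reference it and attaches them as `dashboards` edges on the DataHub dashboard aspect. A minimal sketch of constructing one such edge, mirroring the calls made in the hunk above (the urn name is illustrative):

import datahub.emitter.mce_builder as builder
from datahub.metadata.schema_classes import EdgeClass

# Build the urn of the Power BI dashboard that contains the referencing tile.
dashboard_urn = builder.make_dashboard_urn(
    platform="powerbi",
    platform_instance=None,
    name="dashboards.00000000-0000-0000-0000-000000000000",  # illustrative key
)
edge = EdgeClass(
    destinationUrn=dashboard_urn,
    sourceUrn=None,
    created=None,
    lastModified=None,
    properties=None,
)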
datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py CHANGED

@@ -286,11 +286,15 @@ class Tile:
     id: str
     title: str
     embedUrl: str
-    dataset: Optional["PowerBIDataset"]
     dataset_id: Optional[str]
-
+    report_id: Optional[str]
     createdFrom: CreatedFrom

+    # In a first pass, `dataset_id` and/or `report_id` are filled in.
+    # In a subsequent pass, the objects are populated.
+    dataset: Optional["PowerBIDataset"]
+    report: Optional[Report]
+
     def get_urn_part(self):
         return f"charts.{self.id}"

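The comment added to `Tile` describes a two-pass build: the resolver first records the raw `dataset_id`/`report_id`, and a later pass swaps the ids for object references. A simplified sketch of that idea with stand-in types (not the real Power BI classes):

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class Report:
    id: str
    webUrl: Optional[str] = None


@dataclass
class Tile:
    id: str
    report_id: Optional[str] = None
    report: Optional[Report] = None  # populated in the second pass


def resolve_tiles(tiles: List[Tile], reports_by_id: Dict[str, Report]) -> None:
    # Second pass: turn recorded ids into object references.
    for tile in tiles:
        if tile.report_id:
            tile.report = reports_by_id.get(tile.report_id)


reports = {"r1": Report(id="r1", webUrl="https://app.powerbi.com/r1")}
tiles = [Tile(id="t1", report_id="r1"), Tile(id="t2")]
resolve_tiles(tiles, reports)
assert tiles[0].report is reports["r1"] and tiles[1].report is None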
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py CHANGED

@@ -337,41 +337,6 @@ class DataResolverBase(ABC):
         -tiles), there is no information available on pagination

         """
-
-        def new_dataset_or_report(tile_instance: Any) -> dict:
-            """
-            Find out which is the data source for tile. It is either REPORT or DATASET
-            """
-            report_fields = {
-                Constant.REPORT: (
-                    self.get_report(
-                        workspace=workspace,
-                        report_id=tile_instance.get(Constant.REPORT_ID),
-                    )
-                    if tile_instance.get(Constant.REPORT_ID) is not None
-                    else None
-                ),
-                Constant.CREATED_FROM: Tile.CreatedFrom.UNKNOWN,
-            }
-
-            # reportId and datasetId are exclusive in tile_instance
-            # if datasetId is present that means tile is created from dataset
-            # if reportId is present that means tile is created from report
-            # if both i.e. reportId and datasetId are not present then tile is created from some visualization
-            if tile_instance.get(Constant.REPORT_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.REPORT
-            elif tile_instance.get(Constant.DATASET_ID) is not None:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.DATASET
-            else:
-                report_fields[Constant.CREATED_FROM] = Tile.CreatedFrom.VISUALIZATION
-
-            title: Optional[str] = tile_instance.get(Constant.TITLE)
-            _id: Optional[str] = tile_instance.get(Constant.ID)
-            created_from: Any = report_fields[Constant.CREATED_FROM]
-            logger.info(f"Tile {title}({_id}) is created from {created_from}")
-
-            return report_fields
-
         tile_list_endpoint: str = self.get_tiles_endpoint(
             workspace, dashboard_id=dashboard.id
         )
@@ -393,8 +358,18 @@ class DataResolverBase(ABC):
                 title=instance.get(Constant.TITLE),
                 embedUrl=instance.get(Constant.EMBED_URL),
                 dataset_id=instance.get(Constant.DATASET_ID),
+                report_id=instance.get(Constant.REPORT_ID),
                 dataset=None,
-
+                report=None,
+                createdFrom=(
+                    # In the past we considered that only one of the two report_id or dataset_id would be present
+                    # but we have seen cases where both are present. If both are present, we prioritize the report.
+                    Tile.CreatedFrom.REPORT
+                    if instance.get(Constant.REPORT_ID)
+                    else Tile.CreatedFrom.DATASET
+                    if instance.get(Constant.DATASET_ID)
+                    else Tile.CreatedFrom.VISUALIZATION
+                ),
             )
             for instance in tile_dict
             if instance is not None
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py CHANGED

@@ -625,13 +625,26 @@ class PowerBiAPI:
             dashboard.tiles = self._get_resolver().get_tiles(
                 workspace, dashboard=dashboard
             )
-            # set the dataset for tiles
+            # set the dataset and the report for tiles
             for tile in dashboard.tiles:
+                # In Power BI, dashboards, reports, and datasets are tightly scoped to the workspace they belong to.
+                # https://learn.microsoft.com/en-us/power-bi/collaborate-share/service-new-workspaces
+                if tile.report_id:
+                    tile.report = workspace.reports.get(tile.report_id)
+                    if tile.report is None:
+                        self.reporter.info(
+                            title="Missing Report Lineage For Tile",
+                            message="A Report reference that failed to be resolved. Please ensure that 'extract_reports' is set to True in the configuration.",
+                            context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, report-id: {tile.report_id}",
+                        )
+                # However, semantic models (aka datasets) can be shared accross workspaces
+                # https://learn.microsoft.com/en-us/fabric/admin/portal-workspace#use-semantic-models-across-workspaces
+                # That's why the global 'dataset_registry' is required
                 if tile.dataset_id:
                     tile.dataset = self.dataset_registry.get(tile.dataset_id)
                     if tile.dataset is None:
                         self.reporter.info(
-                            title="Missing Lineage For Tile",
+                            title="Missing Dataset Lineage For Tile",
                             message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
                             context=f"workspace-name: {workspace.name}, tile-name: {tile.title}, dataset-id: {tile.dataset_id}",
                         )
@@ -653,10 +666,10 @@ class PowerBiAPI:
         for dashboard in workspace.dashboards.values():
             dashboard.tags = workspace.dashboard_endorsements.get(dashboard.id, [])

+        # fill reports first since some dashboard may reference a report
+        fill_reports()
         if self.__config.extract_dashboards:
             fill_dashboards()
-
-        fill_reports()
         fill_dashboard_tags()
         self._fill_independent_datasets(workspace=workspace)
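The comments added here draw a line between two lookup scopes: reports are resolved only within the owning workspace, while datasets go through a registry that spans workspaces because semantic models can be shared. A toy sketch of that distinction, with simplified stand-in structures:

from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class Workspace:
    name: str
    reports: Dict[str, str] = field(default_factory=dict)  # report_id -> report name


# Semantic models (datasets) can be shared across workspaces,
# so they live in a single global registry.
dataset_registry: Dict[str, str] = {"ds-1": "Sales semantic model"}

finance = Workspace(name="Finance", reports={"r-1": "Quarterly report"})


def resolve(ws: Workspace, report_id: Optional[str], dataset_id: Optional[str]):
    report = ws.reports.get(report_id) if report_id else None           # workspace-scoped
    dataset = dataset_registry.get(dataset_id) if dataset_id else None  # global
    return report, dataset


assert resolve(finance, "r-1", "ds-1") == ("Quarterly report", "Sales semantic model")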
datahub/ingestion/source/powerbi_report_server/report_server.py CHANGED

@@ -53,6 +53,7 @@ from datahub.metadata.schema_classes import (
     StatusClass,
 )
 from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.lossy_collections import LossyList

 LOGGER = logging.getLogger(__name__)

@@ -476,7 +477,7 @@ class Mapper:

 @dataclass
 class PowerBiReportServerDashboardSourceReport(SourceReport):
     scanned_report: int = 0
-    filtered_reports:
+    filtered_reports: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_scanned(self, count: int = 1) -> None:
         self.scanned_report += count
datahub/ingestion/source/redash.py CHANGED

@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional

 import dateutil.parser as dp
 from packaging import version
@@ -39,7 +39,7 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
-from datahub.utilities.lossy_collections import LossyDict, LossyList
+from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

@@ -280,9 +280,9 @@ class RedashConfig(ConfigModel):
 class RedashSourceReport(SourceReport):
     items_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)
-    queries_problem_parsing:
-    queries_no_dataset:
-    charts_no_input:
+    queries_problem_parsing: LossySet[str] = field(default_factory=LossySet)
+    queries_no_dataset: LossySet[str] = field(default_factory=LossySet)
+    charts_no_input: LossySet[str] = field(default_factory=LossySet)
     total_queries: Optional[int] = field(
         default=None,
     )
datahub/ingestion/source/salesforce.py CHANGED

@@ -1,6 +1,7 @@
 import json
 import logging
 import time
+from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional
@@ -60,6 +61,7 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -146,8 +148,9 @@ class SalesforceConfig(DatasetSourceConfigMixin):
         return config_clean.remove_trailing_slashes(v)


+@dataclass
 class SalesforceSourceReport(SourceReport):
-    filtered:
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED

@@ -98,6 +98,11 @@ class SnowflakeFilterConfig(SQLFilterConfig):
     )
     # table_pattern and view_pattern are inherited from SQLFilterConfig

+    stream_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for streams to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
+
     match_fully_qualified_names: bool = Field(
         default=False,
         description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
@@ -274,6 +279,11 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )

+    include_streams: bool = Field(
+        default=True,
+        description="If enabled, streams will be ingested as separate entities from tables/views.",
+    )
+
     structured_property_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description=(
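The new `stream_pattern` follows the same `AllowDenyPattern` convention as the existing table and view patterns, matching names in `database.schema.stream` form. A small sketch of how such a pattern filters candidate stream names, assuming the usual `allowed()` helper on `AllowDenyPattern`:

from datahub.configuration.common import AllowDenyPattern

# Only ingest streams under the public schema of the Customer database.
stream_pattern = AllowDenyPattern(allow=[r"Customer\.public\..*"])

candidates = ["Customer.public.customer_changes", "Sales.public.orders_stream"]
selected = [name for name in candidates if stream_pattern.allowed(name)]
assert selected == ["Customer.public.customer_changes"]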
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED

@@ -49,6 +49,7 @@ from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownLineageMapping,
+    ObservedQuery,
     PreparsedQuery,
     SqlAggregatorReport,
     SqlParsingAggregator,
@@ -241,7 +242,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
         use_cached_audit_log = audit_log_file.exists()

         queries: FileBackedList[
-            Union[
+            Union[
+                KnownLineageMapping,
+                PreparsedQuery,
+                TableRename,
+                TableSwap,
+                ObservedQuery,
+            ]
         ]
         if use_cached_audit_log:
             logger.info("Using cached audit log")
@@ -252,7 +259,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

         shared_connection = ConnectionWrapper(audit_log_file)
         queries = FileBackedList(shared_connection)
-        entry: Union[
+        entry: Union[
+            KnownLineageMapping,
+            PreparsedQuery,
+            TableRename,
+            TableSwap,
+            ObservedQuery,
+        ]

         with self.report.copy_history_fetch_timer:
             for entry in self.fetch_copy_history():
@@ -329,7 +342,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

     def fetch_query_log(
         self, users: UsersMapping
-    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap]]:
+    ) -> Iterable[Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery]]:
         query_log_query = _build_enriched_query_log_query(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
@@ -362,7 +375,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

     def _parse_audit_log_row(
         self, row: Dict[str, Any], users: UsersMapping
-    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery]]:
+    ) -> Optional[Union[TableRename, TableSwap, PreparsedQuery, ObservedQuery]]:
         json_fields = {
             "DIRECT_OBJECTS_ACCESSED",
             "OBJECTS_MODIFIED",
@@ -398,6 +411,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
                 pass
             else:
                 return None
+
+        user = CorpUserUrn(
+            self.identifiers.get_user_identifier(
+                res["user_name"], users.get(res["user_name"])
+            )
+        )
+
+        # Use direct_objects_accessed instead objects_modified
+        # objects_modified returns $SYS_VIEW_X with no mapping
+        has_stream_objects = any(
+            obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed
+        )
+
+        # If a stream is used, default to query parsing.
+        if has_stream_objects:
+            logger.debug("Found matching stream object")
+            return ObservedQuery(
+                query=res["query_text"],
+                session_id=res["session_id"],
+                timestamp=res["query_start_time"].astimezone(timezone.utc),
+                user=user,
+                default_db=res["default_db"],
+                default_schema=res["default_schema"],
+                query_hash=get_query_fingerprint(
+                    res["query_text"], self.identifiers.platform, fast=True
+                ),
+            )
+
         upstreams = []
         column_usage = {}

@@ -460,12 +501,6 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             )
         )

-        user = CorpUserUrn(
-            self.identifiers.get_user_identifier(
-                res["user_name"], users.get(res["user_name"])
-            )
-        )
-
         timestamp: datetime = res["query_start_time"]
         timestamp = timestamp.astimezone(timezone.utc)

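The new branch in `_parse_audit_log_row` routes any access-history row that touches a stream to SQL parsing (`ObservedQuery`), since `OBJECTS_MODIFIED` only exposes synthetic `$SYS_VIEW_X` names for streams. A standalone sketch of that detection over a decoded `DIRECT_OBJECTS_ACCESSED` payload (field names follow Snowflake's access history schema):

import json
from typing import Any, Dict, List


def touches_stream(direct_objects_accessed: List[Dict[str, Any]]) -> bool:
    # Rows whose accessed objects include objectDomain == "Stream" cannot be
    # resolved from OBJECTS_MODIFIED alone and need full query parsing.
    return any(obj.get("objectDomain") == "Stream" for obj in direct_objects_accessed)


row = json.loads(
    '[{"objectDomain": "Stream", "objectName": "DB.PUBLIC.ORDERS_CHANGES"},'
    ' {"objectDomain": "Table", "objectName": "DB.PUBLIC.ORDERS"}]'
)
assert touches_stream(row)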
datahub/ingestion/source/snowflake/snowflake_query.py CHANGED

@@ -9,6 +9,7 @@ from datahub.ingestion.source.snowflake.snowflake_config import (
 from datahub.utilities.prefix_batch_builder import PrefixGroup

 SHOW_VIEWS_MAX_PAGE_SIZE = 10000
+SHOW_STREAM_MAX_PAGE_SIZE = 10000


 def create_deny_regex_sql_filter(
@@ -36,6 +37,7 @@ class SnowflakeQuery:
         SnowflakeObjectDomain.VIEW.capitalize(),
         SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(),
         SnowflakeObjectDomain.ICEBERG_TABLE.capitalize(),
+        SnowflakeObjectDomain.STREAM.capitalize(),
     }

     ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format(
@@ -44,7 +46,8 @@ class SnowflakeQuery:
     ACCESS_HISTORY_TABLE_DOMAINS_FILTER = (
         "("
         f"'{SnowflakeObjectDomain.TABLE.capitalize()}',"
-        f"'{SnowflakeObjectDomain.VIEW.capitalize()}'"
+        f"'{SnowflakeObjectDomain.VIEW.capitalize()}',"
+        f"'{SnowflakeObjectDomain.STREAM.capitalize()}',"
         ")"
     )

@@ -963,3 +966,19 @@ WHERE table_schema='{schema_name}' AND {extra_clause}"""
     @staticmethod
     def get_all_users() -> str:
         return """SELECT name as "NAME", email as "EMAIL" FROM SNOWFLAKE.ACCOUNT_USAGE.USERS"""
+
+    @staticmethod
+    def streams_for_database(
+        db_name: str,
+        limit: int = SHOW_STREAM_MAX_PAGE_SIZE,
+        stream_pagination_marker: Optional[str] = None,
+    ) -> str:
+        # SHOW STREAMS can return a maximum of 10000 rows.
+        # https://docs.snowflake.com/en/sql-reference/sql/show-streams#usage-notes
+        assert limit <= SHOW_STREAM_MAX_PAGE_SIZE
+
+        # To work around this, we paginate through the results using the FROM clause.
+        from_clause = (
+            f"""FROM '{stream_pagination_marker}'""" if stream_pagination_marker else ""
+        )
+        return f"""SHOW STREAMS IN DATABASE {db_name} LIMIT {limit} {from_clause};"""