acryl-datahub 1.0.0.4rc7__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/METADATA +2509 -2512
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/RECORD +23 -23
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/entrypoints.py +2 -1
- datahub/ingestion/graph/client.py +4 -4
- datahub/ingestion/graph/filters.py +4 -4
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/hex/query_fetcher.py +1 -0
- datahub/ingestion/source/looker/looker_common.py +51 -5
- datahub/ingestion/source/mode.py +268 -174
- datahub/ingestion/source/sql/mssql/source.py +31 -0
- datahub/metadata/_internal_schema_classes.py +476 -476
- datahub/metadata/_urns/urn_defs.py +1703 -1703
- datahub/metadata/schema.avsc +16229 -16229
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +17 -1
- datahub/sql_parsing/sqlglot_lineage.py +342 -23
- datahub/upgrade/upgrade.py +4 -2
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.4rc7.dist-info → acryl_datahub-1.1.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+import os
 import re
 import time
 from dataclasses import dataclass

@@ -9,6 +10,7 @@ from json import JSONDecodeError
 from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
+import psutil
 import pydantic
 import requests
 import sqlglot

@@ -114,8 +116,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.perf_timer import PerfTimer

 logger: logging.Logger = logging.getLogger(__name__)
+# Default API limit for items returned per API call
+# Used for the default per_page value for paginated API requests
+DEFAULT_API_ITEMS_PER_PAGE = 30


 class SpaceKey(ContainerKey):

@@ -194,10 +200,25 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )

+    items_per_page: int = Field(
+        default=DEFAULT_API_ITEMS_PER_PAGE,
+        description="Number of items per page for paginated API requests.",
+        hidden_from_docs=True,
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)

+    @validator("items_per_page")
+    def validate_items_per_page(cls, v):
+        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
+            return v
+        else:
+            raise ValueError(
+                f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
+            )
+

 class HTTPError429(HTTPError):
     pass

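How `items_per_page` gets used: the source's `_get_paged_request_json(url, key, per_page)` helper (its signature appears in a later hunk) walks the Mode API one page at a time. Below is a minimal sketch of that pagination pattern, assuming Mode-style `page`/`per_page` query parameters and the `_embedded` envelope visible in `_get_queries`; `fetch_json` is a hypothetical stand-in for the source's `_get_request_json`:

    from typing import Callable, Dict, Iterator, List

    DEFAULT_API_ITEMS_PER_PAGE = 30

    def paged_json(
        fetch_json: Callable[[str], Dict],  # hypothetical stand-in for _get_request_json
        url: str,
        key: str,
        per_page: int = DEFAULT_API_ITEMS_PER_PAGE,
    ) -> Iterator[List[dict]]:
        """Yield one page of `key` items at a time until the API runs dry."""
        page = 1
        while True:
            payload = fetch_json(f"{url}&page={page}&per_page={per_page}")
            items = payload.get("_embedded", {}).get(key, [])
            if not items:
                break
            yield items
            if len(items) < per_page:  # a short page means we just read the last one
                break
            page += 1

The comments in the hunk above suggest 30 is the API's per-call limit, which would explain why the validator rejects larger values.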
@@ -224,6 +245,20 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_requests_exceeding_rate_limit: int = 0
     num_requests_retried_on_timeout: int = 0
     num_spaces_retrieved: int = 0
+    space_get_api_called: int = 0
+    report_get_api_called: int = 0
+    dataset_get_api_called: int = 0
+    query_get_api_called: int = 0
+    chart_get_api_called: int = 0
+    get_cache_hits: int = 0
+    get_cache_misses: int = 0
+    get_cache_size: int = 0
+    process_memory_used_mb: float = 0
+    space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)

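Each `*_get_timer` accumulates wall-clock time across every API call wrapped in a `with` block, so the report ends up with total time spent per endpoint. A simplified stand-in showing the accumulation behavior (the real class is `datahub.utilities.perf_timer.PerfTimer`; this sketch only assumes it is a re-enterable context manager):

    import time

    class AccumulatingTimer:
        # simplified stand-in for PerfTimer: totals elapsed seconds across with-blocks
        def __init__(self) -> None:
            self.total = 0.0
            self._start = 0.0

        def __enter__(self) -> "AccumulatingTimer":
            self._start = time.perf_counter()
            return self

        def __exit__(self, *exc) -> None:
            self.total += time.perf_counter() - self._start

    timer = AccumulatingTimer()
    for _ in range(3):
        with timer:  # each API call is wrapped, like report.space_get_timer
            time.sleep(0.01)
    print(f"{timer.total:.3f}s total")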
@@ -583,34 +618,38 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
+            with self.report.space_get_timer:
+                for spaces_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces?filter=all",
+                    "spaces",
+                    self.config.items_per_page,
+                ):
+                    self.report.space_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                    )
+                    self.report.num_spaces_retrieved += len(spaces_page)
+                    for s in spaces_page:
+                        logger.debug(f"Space: {s.get('name')}")
+                        space_name = s.get("name", "")
+                        # Using both restricted and default_access_level because
+                        # there is a current bug with restricted returning False every time,
+                        # which has been reported to the Mode team
+                        if self.config.exclude_restricted and (
+                            s.get("restricted")
+                            or s.get("default_access_level") == "restricted"
+                        ):
+                            logging.debug(
+                                f"Skipping space {space_name} due to exclude restricted"
+                            )
+                            continue
+                        if not self.config.space_pattern.allowed(space_name):
+                            self.report.report_dropped_space(space_name)
+                            logging.debug(
+                                f"Skipping space {space_name} due to space pattern"
+                            )
+                            continue
+                        space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",

@@ -1414,48 +1453,75 @@ class ModeSource(StatefulIngestionSourceBase):
         mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)

-    def _get_reports(self, space_token: str) -> List[dict]:
-        reports = []
+    def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
         try:
+            with self.report.report_get_timer:
+                for reports_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.report_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield reports_page
         except ModeRequestError as e:
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Reports Found in Space",
+                    message="No reports were found in the space. It may have been recently deleted.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Reports for Space",
+                    message="Unable to retrieve reports for space token.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )

-    def _get_datasets(self, space_token: str) -> List[dict]:
+    def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
         """
         Retrieves datasets for a given space token.
         """
-        datasets = []
         try:
+            with self.report.dataset_get_timer:
+                for dataset_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.dataset_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield dataset_page
         except ModeRequestError as e:
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Datasets Found in Space",
+                    message="No datasets were found in the space. It may have been recently deleted.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Datasets for Space",
+                    message=f"Unable to retrieve datasets for space token {space_token}.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )

-    def _get_queries(self, report_token: str) -> list:
-        queries = []
+    def _get_queries(self, report_token: str) -> List[dict]:
         try:
+            with self.report.query_get_timer:
+                # This endpoint does not handle pagination properly
+                queries = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries"
+                )
+                self.report.query_get_api_called += 1
+                logger.debug(
+                    f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
+                )
+                return queries.get("_embedded", {}).get("queries", [])
         except ModeRequestError as e:
             if isinstance(e, HTTPError) and e.response.status_code == 404:
                 self.report.report_warning(

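Note the signature change: `_get_reports` and `_get_datasets` now return `Iterator[List[dict]]` rather than a fully materialized list, so callers stream one page at a time and peak memory stays proportional to a single page. A small sketch of the pattern, with hypothetical names:

    from typing import Iterator, List

    def get_pages() -> Iterator[List[dict]]:
        # hypothetical stand-in for _get_reports: yield pages as they arrive,
        # converting API errors into a report entry instead of propagating
        try:
            for page in ([{"token": "a"}], [{"token": "b"}]):
                yield page
        except Exception as exc:  # the real code catches ModeRequestError
            print(f"warning: {exc}")

    for page in get_pages():  # nothing is fetched until iteration begins
        for report in page:
            print(report["token"])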
@@ -1469,44 +1535,53 @@ class ModeSource(StatefulIngestionSourceBase):
                     message="Unable to retrieve queries for report token.",
                     context=f"Report Token: {report_token}, Error: {str(e)}",
                 )
+        return []

     @lru_cache(maxsize=None)
-    def _get_last_query_run(
+    def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
+        # This function is unused and may be subject to removal in a future revision of this source
+        query_runs = []
         try:
-                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs
+            for query_run_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
+                "query_runs",
+                self.config.items_per_page,
+            ):
+                query_runs.extend(query_run_page)
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token:{report_token}, Error: {str(e)}",
             )
-        return queries
+        return query_runs

-    def _get_charts(self, report_token: str, query_token: str) -> list:
-        charts = []
+    def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
         try:
+            with self.report.chart_get_timer:
+                # This endpoint does not handle pagination properly
+                charts = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
+                )
+                self.report.chart_get_api_called += 1
+                logger.debug(
+                    f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
+                )
+                return charts.get("_embedded", {}).get("charts", [])
         except ModeRequestError as e:
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Charts Found for Query",
+                    message="No charts were found for the query. The query may have been recently deleted.",
+                    context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Charts",
+                    message="Unable to retrieve charts from Mode.",
+                    context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
+                )
+        return []

     def _get_paged_request_json(
         self, url: str, key: str, per_page: int

@@ -1521,6 +1596,7 @@ class ModeSource(StatefulIngestionSourceBase):
             yield data
             page += 1

+    @lru_cache(maxsize=None)
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(

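Two things about the new `@lru_cache(maxsize=None)` on `_get_request_json`: repeated GETs for the same URL are now served from memory (the same report can be reached from several spaces), and because the decorator wraps a method, `self` becomes part of the cache key, keeping the source instance alive for the cache's lifetime (acceptable for a single ingestion run). A sketch of the behavior and of the `cache_info()` counters that a later hunk copies into the report:

    from functools import lru_cache

    class Client:
        calls = 0

        @lru_cache(maxsize=None)  # (self, url) is the cache key
        def get_json(self, url: str) -> dict:
            Client.calls += 1  # a real implementation would issue the GET here
            return {"url": url}

    c = Client()
    c.get_json("https://example.invalid/a")
    c.get_json("https://example.invalid/a")  # second call is a cache hit
    print(Client.calls)  # 1
    print(c.get_json.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=None, currsize=1)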
@@ -1568,6 +1644,17 @@ class ModeSource(StatefulIngestionSourceBase):

         return get_request()

+    @staticmethod
+    def _get_process_memory():
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        return {
+            "rss": mem_info.rss / (1024 * 1024),
+            "vms": mem_info.vms / (1024 * 1024),
+            "shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
+            "data": getattr(mem_info, "data", 0) / (1024 * 1024),
+        }
+
     @staticmethod
     def create_embed_aspect_mcp(
         entity_urn: str, embed_url: str

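`_get_process_memory` converts psutil's byte counts to MiB; `shared` and `data` are read with `getattr` because those fields only exist on some platforms. A quick usage sketch:

    import os

    import psutil

    mem = psutil.Process(os.getpid()).memory_info()
    print(f"rss={mem.rss / (1024 * 1024):.1f} MiB")  # resident set size
    print(f"vms={mem.vms / (1024 * 1024):.1f} MiB")  # virtual memory size
    # 'shared'/'data' are platform-dependent, hence the getattr in the source:
    print(f"data={getattr(mem, 'data', 0) / (1024 * 1024):.1f} MiB")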
@@ -1603,115 +1690,116 @@ class ModeSource(StatefulIngestionSourceBase):
             yield from self.construct_space_container(space_token, space_name)
             space_container_key = self.gen_space_key(space_token)

-                if dashboard_tuple_from_report is None:
-                    continue
-                (
-                    dashboard_snapshot_from_report,
-                    browse_mcpw,
-                ) = dashboard_tuple_from_report
-                yield mcpw.as_workunit()
-                yield from add_dataset_to_container(
-                    container_key=space_container_key,
-                    dataset_urn=dashboard_snapshot_from_report.urn,
-                )
-                yield browse_mcpw.as_workunit()
-                    embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    logger.debug(
+                        f"Report: name: {report.get('name')} token: {report.get('token')}"
+                    )
+                    dashboard_tuple_from_report = self.construct_dashboard(
+                        space_token=space_token, report_info=report
+                    )
+
+                    if dashboard_tuple_from_report is None:
+                        continue
+                    (
+                        dashboard_snapshot_from_report,
+                        browse_mcpw,
+                    ) = dashboard_tuple_from_report
+
+                    mce = MetadataChangeEvent(
+                        proposedSnapshot=dashboard_snapshot_from_report
+                    )
+
+                    mcpw = MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
+                    )
+                    yield mcpw.as_workunit()
+                    yield from add_dataset_to_container(
+                        container_key=space_container_key,
+                        dataset_urn=dashboard_snapshot_from_report.urn,
+                    )
+                    yield browse_mcpw.as_workunit()
+
+                    usage_statistics = DashboardUsageStatisticsClass(
+                        timestampMillis=round(datetime.now().timestamp() * 1000),
+                        viewsCount=report.get("view_count", 0),
+                    )
+
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=usage_statistics,
                     ).as_workunit()

+                    if self.config.ingest_embed_url is True:
+                        yield self.create_embed_aspect_mcp(
+                            entity_urn=dashboard_snapshot_from_report.urn,
+                            embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                        ).as_workunit()
+
+                    yield MetadataWorkUnit(
+                        id=dashboard_snapshot_from_report.urn, mce=mce
+                    )

     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
         for space_token in self.space_tokens:
-                        query,
-                        space_token=space_token,
-                        report_info=report,
-                        is_mode_dataset=False,
-                    )
-                    chart_fields: Dict[str, SchemaFieldClass] = {}
-                    for wu in query_mcps:
-                        if isinstance(
-                            wu.metadata, MetadataChangeProposalWrapper
-                        ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
-                            schema_metadata = wu.metadata.aspect
-                            for field in schema_metadata.fields:
-                                chart_fields.setdefault(field.fieldPath, field)
-
-                        yield wu
-
-                    charts = self._get_charts(report_token, query.get("token", ""))
-                    # build charts
-                    for i, chart in enumerate(charts):
-                        yield from self.construct_chart_from_api_data(
-                            i,
-                            chart,
-                            chart_fields,
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    report_token = report.get("token", "")
+
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
                             query,
                             space_token=space_token,
                             report_info=report,
+                            is_mode_dataset=False,
                         )
+                        chart_fields: Dict[str, SchemaFieldClass] = {}
+                        for wu in query_mcps:
+                            if isinstance(
+                                wu.metadata, MetadataChangeProposalWrapper
+                            ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
+                                schema_metadata = wu.metadata.aspect
+                                for field in schema_metadata.fields:
+                                    chart_fields.setdefault(field.fieldPath, field)
+
+                            yield wu
+
+                        charts = self._get_charts(report_token, query.get("token", ""))
+                        # build charts
+                        for i, chart in enumerate(charts):
+                            yield from self.construct_chart_from_api_data(
+                                i,
+                                chart,
+                                chart_fields,
+                                query,
+                                space_token=space_token,
+                                report_info=report,
+                                query_name=query["name"],
+                            )

     def emit_dataset_mces(self):
         """
         Emits MetadataChangeEvents (MCEs) for datasets within each space.
         """
         for space_token, _ in self.space_tokens.items():
-            yield wu
+            for dataset_page in self._get_datasets(space_token):
+                for report in dataset_page:
+                    report_token = report.get("token", "")
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
+                            query,
+                            space_token=space_token,
+                            report_info=report,
+                            is_mode_dataset=True,
+                        )
+                        for wu in query_mcps:
+                            yield wu

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":

@@ -1730,6 +1818,12 @@ class ModeSource(StatefulIngestionSourceBase):
         yield from self.emit_dashboard_mces()
         yield from self.emit_dataset_mces()
         yield from self.emit_chart_mces()
+        cache_info = self._get_request_json.cache_info()
+        self.report.get_cache_hits = cache_info.hits
+        self.report.get_cache_misses = cache_info.misses
+        self.report.get_cache_size = cache_info.currsize
+        memory_used = self._get_process_memory()
+        self.report.process_memory_used_mb = round(memory_used["rss"], 2)

     def get_report(self) -> SourceReport:
         return self.report

datahub/ingestion/source/sql/mssql/source.py
CHANGED

@@ -13,6 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError

 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (

@@ -60,6 +61,15 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
 register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
 register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)

+# Patterns copied from Snowflake source
+DEFAULT_TEMP_TABLES_PATTERNS = [
+    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
+    r".*__DBT_TMP$",  # dbt
+    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
+    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
+    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
+]
+

 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults

@@ -114,6 +124,12 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
     )
+    temporary_tables_pattern: List[str] = Field(
+        default=DEFAULT_TEMP_TABLES_PATTERNS,
+        description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
+        "match the entire table name in database.schema.table format. Defaults are to set in such a way "
+        "to ignore the temporary staging tables created by known ETL tools.",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):

@@ -179,6 +195,14 @@ class SQLServerSource(SQLAlchemySource):
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
+
+        self.report = SQLSourceReport()
+        if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
+            self.report.warning(
+                title="Potential issue with lineage",
+                message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure lineage correct, set 'convert_urns_to_lowercase' to True.",
+            )
+
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)

@@ -774,6 +798,13 @@ class SQLServerSource(SQLAlchemySource):
         )

     def is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            logger.debug(f"temp table matched by pattern {name}")
+            return True
+
         try:
             parts = name.split(".")
             table_name = parts[-1]