acryl-datahub 1.0.0.4rc6__py3-none-any.whl → 1.0.0.4rc8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/METADATA +2471 -2471
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/RECORD +21 -21
- datahub/_version.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_api.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +15 -0
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/hex/query_fetcher.py +1 -0
- datahub/ingestion/source/mode.py +232 -157
- datahub/ingestion/source/sql/mssql/source.py +31 -0
- datahub/ingestion/source/sql/presto.py +18 -1
- datahub/ingestion/source/sql/trino.py +28 -6
- datahub/metadata/_internal_schema_classes.py +476 -476
- datahub/metadata/_urns/urn_defs.py +1703 -1703
- datahub/metadata/schema.avsc +16229 -16229
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +17 -1
- datahub/sql_parsing/sqlglot_lineage.py +220 -9
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.4rc6.dist-info → acryl_datahub-1.0.0.4rc8.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mode.py
CHANGED
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+import os
 import re
 import time
 from dataclasses import dataclass
@@ -9,6 +10,7 @@ from json import JSONDecodeError
 from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
+import psutil
 import pydantic
 import requests
 import sqlglot
@@ -114,8 +116,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.perf_timer import PerfTimer

 logger: logging.Logger = logging.getLogger(__name__)
+# Default API limit for items returned per API call
+# Used for the default per_page value for paginated API requests
+DEFAULT_API_ITEMS_PER_PAGE = 30


 class SpaceKey(ContainerKey):
@@ -194,10 +200,25 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )

+    items_per_page: int = Field(
+        default=DEFAULT_API_ITEMS_PER_PAGE,
+        description="Number of items per page for paginated API requests.",
+        hidden_from_docs=True,
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)

+    @validator("items_per_page")
+    def validate_items_per_page(cls, v):
+        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
+            return v
+        else:
+            raise ValueError(
+                f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
+            )
+

 class HTTPError429(HTTPError):
     pass
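The new items_per_page knob is validated against the module-level default, so it can only lower the page size, never raise it past the API's default of 30. A standalone sketch of the same pydantic v1 validator pattern; PageConfig below is a hypothetical stand-in for ModeConfig, which has additional required fields:

import pydantic
from pydantic import Field, validator

DEFAULT_API_ITEMS_PER_PAGE = 30

class PageConfig(pydantic.BaseModel):
    # Mirrors the items_per_page field and validator added to ModeConfig above.
    items_per_page: int = Field(default=DEFAULT_API_ITEMS_PER_PAGE)

    @validator("items_per_page")
    def validate_items_per_page(cls, v):
        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
            return v
        raise ValueError(
            f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
        )

print(PageConfig().items_per_page)                   # 30
print(PageConfig(items_per_page=10).items_per_page)  # 10
try:
    PageConfig(items_per_page=100)                   # above the cap
except pydantic.ValidationError as e:
    print(e)  # ...items_per_page must be between 1 and 30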
@@ -224,6 +245,20 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_requests_exceeding_rate_limit: int = 0
     num_requests_retried_on_timeout: int = 0
     num_spaces_retrieved: int = 0
+    space_get_api_called: int = 0
+    report_get_api_called: int = 0
+    dataset_get_api_called: int = 0
+    query_get_api_called: int = 0
+    chart_get_api_called: int = 0
+    get_cache_hits: int = 0
+    get_cache_misses: int = 0
+    get_cache_size: int = 0
+    process_memory_used_mb: float = 0
+    space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)

     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
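The report now pairs per-endpoint call counters with PerfTimer fields. PerfTimer is DataHub's context-manager stopwatch; a minimal sketch of how one timer can accumulate across repeated with-blocks, as the source uses it below (the elapsed_seconds() read-out and accumulate-on-reentry behavior are assumptions about the utility, not shown in this diff):

import time

from datahub.utilities.perf_timer import PerfTimer

timer = PerfTimer()

def fetch_page() -> None:
    time.sleep(0.01)  # stand-in for one paginated API call

# Each with-block adds to the same timer, so a single report field can
# hold the total time spent across every page fetch of one endpoint.
for _ in range(3):
    with timer:
        fetch_page()

print(f"total fetch time: {timer.elapsed_seconds():.3f}s")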
@@ -583,34 +618,38 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            …
-            )
-            …
-            )
-            …
+            with self.report.space_get_timer:
+                for spaces_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces?filter=all",
+                    "spaces",
+                    self.config.items_per_page,
+                ):
+                    self.report.space_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                    )
+                    self.report.num_spaces_retrieved += len(spaces_page)
+                    for s in spaces_page:
+                        logger.debug(f"Space: {s.get('name')}")
+                        space_name = s.get("name", "")
+                        # Using both restricted and default_access_level because
+                        # there is a current bug with restricted returning False every time,
+                        # which has been reported to the Mode team
+                        if self.config.exclude_restricted and (
+                            s.get("restricted")
+                            or s.get("default_access_level") == "restricted"
+                        ):
+                            logging.debug(
+                                f"Skipping space {space_name} due to exclude restricted"
+                            )
+                            continue
+                        if not self.config.space_pattern.allowed(space_name):
+                            self.report.report_dropped_space(space_name)
+                            logging.debug(
+                                f"Skipping space {space_name} due to space pattern"
+                            )
+                            continue
+                        space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
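All of the space/report/dataset fetches now route through _get_paged_request_json, whose body appears further down (yield data / page += 1). A self-contained sketch of that shape, assuming Mode's _embedded response envelope and page/per_page query parameters; the real method goes through the retrying, cached _get_request_json rather than calling requests directly:

from typing import Dict, Iterator, List

import requests

def get_paged_request_json(url: str, key: str, per_page: int) -> Iterator[List[Dict]]:
    # Fetch one page at a time and yield it, so callers never hold the
    # whole result set in memory; stop on an empty or short page.
    page = 1
    while True:
        resp = requests.get(url, params={"page": page, "per_page": per_page})
        resp.raise_for_status()
        data = resp.json().get("_embedded", {}).get(key, [])
        if not data:
            return
        yield data
        if len(data) < per_page:
            return
        page += 1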
@@ -1414,48 +1453,61 @@ class ModeSource(StatefulIngestionSourceBase):
         mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)

-
-    def _get_reports(self, space_token: str) -> List[dict]:
-        reports = []
+    def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
         try:
-            …
+            with self.report.report_get_timer:
+                for reports_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.report_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield reports_page
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
                 context=f"Space Token: {space_token}, Error: {str(e)}",
             )
-        return reports

-
-    def _get_datasets(self, space_token: str) -> List[dict]:
+    def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
         """
         Retrieves datasets for a given space token.
         """
-        datasets = []
         try:
-            …
+            with self.report.dataset_get_timer:
+                for dataset_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.dataset_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield dataset_page
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
                 context=f"Error: {str(e)}",
             )
-        return datasets

-
-    def _get_queries(self, report_token: str) -> list:
-        queries = []
+    def _get_queries(self, report_token: str) -> List[dict]:
         try:
-            …
+            with self.report.query_get_timer:
+                # This endpoint does not handle pagination properly
+                queries = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries"
+                )
+                self.report.query_get_api_called += 1
+                logger.debug(
+                    f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
+                )
+                return queries.get("_embedded", {}).get("queries", [])
         except ModeRequestError as e:
             if isinstance(e, HTTPError) and e.response.status_code == 404:
                 self.report.report_warning(
@@ -1469,35 +1521,39 @@ class ModeSource(StatefulIngestionSourceBase):
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token: {report_token}, Error: {str(e)}",
             )
-        …
+        return []

     @lru_cache(maxsize=None)
-    def _get_last_query_run(
-        …
+    def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
+        # This function is unused and may be subject to removal in a future revision of this source
+        query_runs = []
         try:
-            …
-            f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs
-            …
+            for query_run_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
+                "query_runs",
+                self.config.items_per_page,
+            ):
+                query_runs.extend(query_run_page)
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token:{report_token}, Error: {str(e)}",
             )
-        …
-        return queries
+        return query_runs

-
-    def _get_charts(self, report_token: str, query_token: str) -> list:
-        charts = []
+    def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
         try:
-            …
+            with self.report.chart_get_timer:
+                # This endpoint does not handle pagination properly
+                charts = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
+                )
+                self.report.chart_get_api_called += 1
+                logger.debug(
+                    f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
+                )
+                return charts.get("_embedded", {}).get("charts", [])
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
@@ -1506,7 +1562,7 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"Query token: {query_token}, "
                 f"Error: {str(e)}",
             )
-        …
+        return []

     def _get_paged_request_json(
         self, url: str, key: str, per_page: int
@@ -1521,6 +1577,7 @@
             yield data
             page += 1

+    @lru_cache(maxsize=20480)
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
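Decorating the instance method with @lru_cache dedupes repeated GETs for the same URL; self becomes part of the cache key, which is why get_workunits can later read self._get_request_json.cache_info(). A small sketch of the mechanics, with hypothetical names:

from functools import lru_cache

class Client:
    @lru_cache(maxsize=20480)
    def get_json(self, url: str) -> dict:
        print(f"GET {url}")  # printed only on a cache miss
        return {"url": url}

c = Client()
c.get_json("https://example.com/a")
c.get_json("https://example.com/a")  # second call is served from the cache

info = c.get_json.cache_info()
print(info.hits, info.misses, info.currsize)  # 1 1 1

A known trade-off of caching bound methods this way is that the cache keeps references to the instance and to every distinct response until entries are evicted, which is presumably why the same release starts reporting cache size and process memory.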
@@ -1568,6 +1625,17 @@

         return get_request()

+    @staticmethod
+    def _get_process_memory():
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        return {
+            "rss": mem_info.rss / (1024 * 1024),
+            "vms": mem_info.vms / (1024 * 1024),
+            "shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
+            "data": getattr(mem_info, "data", 0) / (1024 * 1024),
+        }
+
     @staticmethod
     def create_embed_aspect_mcp(
         entity_urn: str, embed_url: str
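_get_process_memory is a thin wrapper over psutil that converts bytes to MiB; shared and data are Linux-only fields, hence the getattr fallbacks. The same numbers can be inspected directly:

import os

import psutil

proc = psutil.Process(os.getpid())
mem = proc.memory_info()
# rss (resident set size) is what the report surfaces as process_memory_used_mb.
print(f"rss={mem.rss / (1024 * 1024):.1f} MiB, vms={mem.vms / (1024 * 1024):.1f} MiB")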
@@ -1603,115 +1671,116 @@
             yield from self.construct_space_container(space_token, space_name)
             space_container_key = self.gen_space_key(space_token)

-            …
-            if dashboard_tuple_from_report is None:
-                continue
-            (
-                dashboard_snapshot_from_report,
-                browse_mcpw,
-            ) = dashboard_tuple_from_report
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    logger.debug(
+                        f"Report: name: {report.get('name')} token: {report.get('token')}"
+                    )
+                    dashboard_tuple_from_report = self.construct_dashboard(
+                        space_token=space_token, report_info=report
+                    )

-            …
+                    if dashboard_tuple_from_report is None:
+                        continue
+                    (
+                        dashboard_snapshot_from_report,
+                        browse_mcpw,
+                    ) = dashboard_tuple_from_report

-            …
-            )
-            yield mcpw.as_workunit()
-            yield from add_dataset_to_container(
-                container_key=space_container_key,
-                dataset_urn=dashboard_snapshot_from_report.urn,
-            )
-            yield browse_mcpw.as_workunit()
+                    mce = MetadataChangeEvent(
+                        proposedSnapshot=dashboard_snapshot_from_report
+                    )

-            …
+                    mcpw = MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
+                    )
+                    yield mcpw.as_workunit()
+                    yield from add_dataset_to_container(
+                        container_key=space_container_key,
+                        dataset_urn=dashboard_snapshot_from_report.urn,
+                    )
+                    yield browse_mcpw.as_workunit()

-            …
+                    usage_statistics = DashboardUsageStatisticsClass(
+                        timestampMillis=round(datetime.now().timestamp() * 1000),
+                        viewsCount=report.get("view_count", 0),
+                    )

-            …
-                embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=usage_statistics,
                    ).as_workunit()

-            …
+                    if self.config.ingest_embed_url is True:
+                        yield self.create_embed_aspect_mcp(
+                            entity_urn=dashboard_snapshot_from_report.urn,
+                            embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                        ).as_workunit()
+
+                    yield MetadataWorkUnit(
+                        id=dashboard_snapshot_from_report.urn, mce=mce
+                    )

     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
         for space_token in self.space_tokens:
-            …
-                    query,
-                    space_token=space_token,
-                    report_info=report,
-                    is_mode_dataset=False,
-                )
-                chart_fields: Dict[str, SchemaFieldClass] = {}
-                for wu in query_mcps:
-                    if isinstance(
-                        wu.metadata, MetadataChangeProposalWrapper
-                    ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
-                        schema_metadata = wu.metadata.aspect
-                        for field in schema_metadata.fields:
-                            chart_fields.setdefault(field.fieldPath, field)
-
-                    yield wu
-
-                charts = self._get_charts(report_token, query.get("token", ""))
-                # build charts
-                for i, chart in enumerate(charts):
-                    yield from self.construct_chart_from_api_data(
-                        i,
-                        chart,
-                        chart_fields,
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    report_token = report.get("token", "")
+
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
                             query,
                             space_token=space_token,
                             report_info=report,
-                            …
+                            is_mode_dataset=False,
                         )
+                        chart_fields: Dict[str, SchemaFieldClass] = {}
+                        for wu in query_mcps:
+                            if isinstance(
+                                wu.metadata, MetadataChangeProposalWrapper
+                            ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
+                                schema_metadata = wu.metadata.aspect
+                                for field in schema_metadata.fields:
+                                    chart_fields.setdefault(field.fieldPath, field)
+
+                            yield wu
+
+                        charts = self._get_charts(report_token, query.get("token", ""))
+                        # build charts
+                        for i, chart in enumerate(charts):
+                            yield from self.construct_chart_from_api_data(
+                                i,
+                                chart,
+                                chart_fields,
+                                query,
+                                space_token=space_token,
+                                report_info=report,
+                                query_name=query["name"],
+                            )

     def emit_dataset_mces(self):
         """
         Emits MetadataChangeEvents (MCEs) for datasets within each space.
         """
         for space_token, _ in self.space_tokens.items():
-            …
-            yield wu
+            for dataset_page in self._get_datasets(space_token):
+                for report in dataset_page:
+                    report_token = report.get("token", "")
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
+                            query,
+                            space_token=space_token,
+                            report_info=report,
+                            is_mode_dataset=True,
+                        )
+                        for wu in query_mcps:
+                            yield wu

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":
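The dashboard, chart, and dataset emitters are all rewritten around the same page-at-a-time shape: instead of materializing every report up front (the old helpers built and returned full lists), they iterate over pages yielded by _get_reports / _get_datasets and then over the records in each page. A stripped-down sketch of why that bounds memory; the names below are illustrative, not from the source:

from typing import Dict, Iterator, List

def pages() -> Iterator[List[Dict]]:
    # Stand-in for _get_reports(space_token): one page of records per yield.
    for i in range(3):
        yield [{"token": f"r{i}-{j}"} for j in range(30)]

def emit(report: Dict) -> None:
    pass  # stand-in for building work units for one report

# At most one page (items_per_page records) is alive at a time, versus the
# full report list the old loop built before iterating.
for page in pages():
    for report in page:
        emit(report)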
@@ -1730,6 +1799,12 @@ class ModeSource(StatefulIngestionSourceBase):
         yield from self.emit_dashboard_mces()
         yield from self.emit_dataset_mces()
         yield from self.emit_chart_mces()
+        cache_info = self._get_request_json.cache_info()
+        self.report.get_cache_hits = cache_info.hits
+        self.report.get_cache_misses = cache_info.misses
+        self.report.get_cache_size = cache_info.currsize
+        memory_used = self._get_process_memory()
+        self.report.process_memory_used_mb = round(memory_used["rss"], 2)

     def get_report(self) -> SourceReport:
         return self.report
datahub/ingestion/source/sql/mssql/source.py
CHANGED

@@ -13,6 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError

 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -60,6 +61,15 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
 register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
 register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)

+# Patterns copied from Snowflake source
+DEFAULT_TEMP_TABLES_PATTERNS = [
+    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
+    r".*__DBT_TMP$",  # dbt
+    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
+    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
+    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
+]
+

 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
@@ -114,6 +124,12 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
     )
+    temporary_tables_pattern: List[str] = Field(
+        default=DEFAULT_TEMP_TABLES_PATTERNS,
+        description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
+        "match the entire table name in database.schema.table format. Defaults are set in such a way "
+        "as to ignore the temporary staging tables created by known ETL tools.",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -179,6 +195,14 @@ class SQLServerSource(SQLAlchemySource):
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
+
+        self.report = SQLSourceReport()
+        if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
+            self.report.warning(
+                title="Potential issue with lineage",
+                message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure correct lineage, set 'convert_urns_to_lowercase' to True.",
+            )
+
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -774,6 +798,13 @@
         )

     def is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            logger.debug(f"temp table matched by pattern {name}")
+            return True
+
         try:
             parts = name.split(".")
             table_name = parts[-1]
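Taken together, the mssql changes make is_temp_table short-circuit on the new config patterns before falling back to the existing name heuristics. A quick standalone check of the default patterns; the is_temp helper below is a stand-in for the method above, and the sample table names are illustrative:

import re

from datahub.configuration.pattern_utils import UUID_REGEX

DEFAULT_TEMP_TABLES_PATTERNS = [
    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
    r".*__DBT_TMP$",  # dbt
    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
]

def is_temp(name: str) -> bool:
    # re.match anchors at the start of the name; the patterns supply their own tails.
    return any(
        re.match(pattern, name, flags=re.IGNORECASE)
        for pattern in DEFAULT_TEMP_TABLES_PATTERNS
    )

print(is_temp("analytics.web.orders__dbt_tmp"))       # True (dbt)
print(is_temp("raw.fivetran_abc_staging.customers"))  # True (fivetran)
print(is_temp("analytics.web.orders"))                # False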
datahub/ingestion/source/sql/presto.py
CHANGED

@@ -1,10 +1,12 @@
+import functools
 from textwrap import dedent
-from typing import Optional
+from typing import Dict, Optional

 from pydantic.fields import Field
 from pyhive.sqlalchemy_presto import PrestoDialect
 from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
+from sqlalchemy.engine.base import Engine

 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -114,3 +116,18 @@ class PrestoSource(TrinoSource):
     def create(cls, config_dict, ctx):
         config = PrestoConfig.parse_obj(config_dict)
         return cls(config, ctx)
+
+
+# Unfortunately, the Presto dialect only provides catalog_name as a column,
+# so we need a workaround to avoid failing; as a result we only get the table
+# comment as a property, which is still better than failing outright.
+@functools.lru_cache
+def gen_catalog_connector_dict(engine: Engine) -> Dict[str, str]:
+    query = dedent(
+        """
+        SELECT *
+        FROM "system"."metadata"."catalogs"
+        """
+    ).strip()
+    res = engine.execute(sql.text(query))
+    return {row.catalog_name: "" for row in res}
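A hypothetical usage sketch for the new helper, assuming it is importable from the module shown above; the connection URL is illustrative, and since functools.lru_cache keys on the Engine object, repeated calls with the same engine skip the catalog query:

from sqlalchemy import create_engine

from datahub.ingestion.source.sql.presto import gen_catalog_connector_dict

engine = create_engine("presto://user@presto-host:8080/system")

catalogs = gen_catalog_connector_dict(engine)  # queries system.metadata.catalogs once
catalogs = gen_catalog_connector_dict(engine)  # second call is served from the cache
print(sorted(catalogs))  # e.g. ['hive', 'system']; connector names are left empty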