acryl-datahub 1.0.0.4rc7__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+import os
 import re
 import time
 from dataclasses import dataclass
@@ -9,6 +10,7 @@ from json import JSONDecodeError
 from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
 
 import dateutil.parser as dp
+import psutil
 import pydantic
 import requests
 import sqlglot
@@ -114,8 +116,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.perf_timer import PerfTimer
 
 logger: logging.Logger = logging.getLogger(__name__)
+# Default API limit for items returned per API call
+# Used for the default per_page value for paginated API requests
+DEFAULT_API_ITEMS_PER_PAGE = 30
 
 
 class SpaceKey(ContainerKey):
@@ -194,10 +200,25 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )
 
+    items_per_page: int = Field(
+        default=DEFAULT_API_ITEMS_PER_PAGE,
+        description="Number of items per page for paginated API requests.",
+        hidden_from_docs=True,
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
+    @validator("items_per_page")
+    def validate_items_per_page(cls, v):
+        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
+            return v
+        else:
+            raise ValueError(
+                f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
+            )
+
 
 class HTTPError429(HTTPError):
     pass
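
The new `items_per_page` knob is validated rather than clamped: anything outside 1..30 is rejected at config-parse time. A minimal standalone sketch of the same check, in the pydantic v1 style the source uses (`PageConfig` is an illustrative stand-in, not the real `ModeConfig`):

```python
import pydantic

DEFAULT_API_ITEMS_PER_PAGE = 30  # per the new module-level constant above


class PageConfig(pydantic.BaseModel):  # stand-in for ModeConfig
    items_per_page: int = DEFAULT_API_ITEMS_PER_PAGE

    @pydantic.validator("items_per_page")
    def validate_items_per_page(cls, v):
        # Reject out-of-range values instead of silently clamping them.
        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
            return v
        raise ValueError(
            f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
        )


PageConfig(items_per_page=10)  # accepted
try:
    PageConfig(items_per_page=100)
except pydantic.ValidationError as e:
    print(e)  # items_per_page must be between 1 and 30
```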
@@ -224,6 +245,20 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_requests_exceeding_rate_limit: int = 0
     num_requests_retried_on_timeout: int = 0
     num_spaces_retrieved: int = 0
+    space_get_api_called: int = 0
+    report_get_api_called: int = 0
+    dataset_get_api_called: int = 0
+    query_get_api_called: int = 0
+    chart_get_api_called: int = 0
+    get_cache_hits: int = 0
+    get_cache_misses: int = 0
+    get_cache_size: int = 0
+    process_memory_used_mb: float = 0
+    space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
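
These report fields lean on `PerfTimer` being re-enterable: each paged fetch re-enters the same timer as a context manager, so a single field accumulates the total wall-clock time for that API family. A small sketch of the pattern, assuming the context-manager and `elapsed_seconds()` interface of `datahub.utilities.perf_timer.PerfTimer`:

```python
import time
from dataclasses import dataclass, field

from datahub.utilities.perf_timer import PerfTimer


@dataclass
class MiniReport:  # illustrative stand-in for ModeSourceReport
    space_get_timer: PerfTimer = field(default_factory=PerfTimer)


report = MiniReport()
for _ in range(3):
    with report.space_get_timer:  # re-entering the same timer accumulates time
        time.sleep(0.05)          # stand-in for one paged API request
print(f"{report.space_get_timer.elapsed_seconds():.2f}s")  # roughly 0.15
```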
@@ -583,34 +618,38 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            for spaces_page in self._get_paged_request_json(
-                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
-            ):
-                logger.debug(
-                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
-                )
-                self.report.num_spaces_retrieved += len(spaces_page)
-                for s in spaces_page:
-                    logger.debug(f"Space: {s.get('name')}")
-                    space_name = s.get("name", "")
-                    # Using both restricted and default_access_level because
-                    # there is a current bug with restricted returning False everytime
-                    # which has been reported to Mode team
-                    if self.config.exclude_restricted and (
-                        s.get("restricted")
-                        or s.get("default_access_level") == "restricted"
-                    ):
-                        logging.debug(
-                            f"Skipping space {space_name} due to exclude restricted"
-                        )
-                        continue
-                    if not self.config.space_pattern.allowed(space_name):
-                        self.report.report_dropped_space(space_name)
-                        logging.debug(
-                            f"Skipping space {space_name} due to space pattern"
-                        )
-                        continue
-                    space_info[s.get("token", "")] = s.get("name", "")
+            with self.report.space_get_timer:
+                for spaces_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces?filter=all",
+                    "spaces",
+                    self.config.items_per_page,
+                ):
+                    self.report.space_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                    )
+                    self.report.num_spaces_retrieved += len(spaces_page)
+                    for s in spaces_page:
+                        logger.debug(f"Space: {s.get('name')}")
+                        space_name = s.get("name", "")
+                        # Using both restricted and default_access_level because
+                        # there is a current bug with restricted returning False everytime
+                        # which has been reported to Mode team
+                        if self.config.exclude_restricted and (
+                            s.get("restricted")
+                            or s.get("default_access_level") == "restricted"
+                        ):
+                            logging.debug(
+                                f"Skipping space {space_name} due to exclude restricted"
+                            )
+                            continue
+                        if not self.config.space_pattern.allowed(space_name):
+                            self.report.report_dropped_space(space_name)
+                            logging.debug(
+                                f"Skipping space {space_name} due to space pattern"
+                            )
+                            continue
+                        space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1414,48 +1453,75 @@ class ModeSource(StatefulIngestionSourceBase):
         mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
 
-    @lru_cache(maxsize=None)
-    def _get_reports(self, space_token: str) -> List[dict]:
-        reports = []
+    def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
         try:
-            reports_json = self._get_request_json(
-                f"{self.workspace_uri}/spaces/{space_token}/reports"
-            )
-            reports = reports_json.get("_embedded", {}).get("reports", {})
+            with self.report.report_get_timer:
+                for reports_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.report_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield reports_page
         except ModeRequestError as e:
-            self.report.report_failure(
-                title="Failed to Retrieve Reports for Space",
-                message="Unable to retrieve reports for space token.",
-                context=f"Space Token: {space_token}, Error: {str(e)}",
-            )
-        return reports
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Reports Found in Space",
+                    message="No reports were found in the space. It may have been recently deleted.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Reports for Space",
+                    message="Unable to retrieve reports for space token.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
 
-    @lru_cache(maxsize=None)
-    def _get_datasets(self, space_token: str) -> List[dict]:
+    def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
         """
         Retrieves datasets for a given space token.
         """
-        datasets = []
         try:
-            url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
-            datasets_json = self._get_request_json(url)
-            datasets = datasets_json.get("_embedded", {}).get("reports", [])
+            with self.report.dataset_get_timer:
+                for dataset_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.dataset_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield dataset_page
         except ModeRequestError as e:
-            self.report.report_failure(
-                title="Failed to Retrieve Datasets for Space",
-                message=f"Unable to retrieve datasets for space token {space_token}.",
-                context=f"Error: {str(e)}",
-            )
-        return datasets
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Datasets Found in Space",
+                    message="No datasets were found in the space. It may have been recently deleted.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Datasets for Space",
+                    message=f"Unable to retrieve datasets for space token {space_token}.",
+                    context=f"Space Token: {space_token}, Error: {str(e)}",
+                )
 
-    @lru_cache(maxsize=None)
-    def _get_queries(self, report_token: str) -> list:
-        queries = []
+    def _get_queries(self, report_token: str) -> List[dict]:
         try:
-            queries_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}/queries"
-            )
-            queries = queries_json.get("_embedded", {}).get("queries", {})
+            with self.report.query_get_timer:
+                # This endpoint does not handle pagination properly
+                queries = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries"
+                )
+                self.report.query_get_api_called += 1
+                logger.debug(
+                    f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
+                )
+                return queries.get("_embedded", {}).get("queries", [])
         except ModeRequestError as e:
             if isinstance(e, HTTPError) and e.response.status_code == 404:
                 self.report.report_warning(
@@ -1469,44 +1535,53 @@ class ModeSource(StatefulIngestionSourceBase):
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token: {report_token}, Error: {str(e)}",
             )
-        return queries
+        return []
 
     @lru_cache(maxsize=None)
-    def _get_last_query_run(
-        self, report_token: str, report_run_id: str, query_run_id: str
-    ) -> Dict:
+    def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
+        # This function is unused and may be subject to removal in a future revision of this source
+        query_runs = []
         try:
-            queries_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
-            )
-            queries = queries_json.get("_embedded", {}).get("queries", {})
+            for query_run_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
+                "query_runs",
+                self.config.items_per_page,
+            ):
+                query_runs.extend(query_run_page)
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token:{report_token}, Error: {str(e)}",
             )
-            return {}
-        return queries
+        return query_runs
 
-    @lru_cache(maxsize=None)
-    def _get_charts(self, report_token: str, query_token: str) -> list:
-        charts = []
+    def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
         try:
-            charts_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}"
-                f"/queries/{query_token}/charts"
-            )
-            charts = charts_json.get("_embedded", {}).get("charts", {})
+            with self.report.chart_get_timer:
+                # This endpoint does not handle pagination properly
+                charts = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
+                )
+                self.report.chart_get_api_called += 1
+                logger.debug(
+                    f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
+                )
+                return charts.get("_embedded", {}).get("charts", [])
         except ModeRequestError as e:
-            self.report.report_failure(
-                title="Failed to Retrieve Charts",
-                message="Unable to retrieve charts from Mode.",
-                context=f"Report Token: {report_token}, "
-                f"Query token: {query_token}, "
-                f"Error: {str(e)}",
-            )
-        return charts
+            if isinstance(e, HTTPError) and e.response.status_code == 404:
+                self.report.report_warning(
+                    title="No Charts Found for Query",
+                    message="No charts were found for the query. The query may have been recently deleted.",
+                    context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
+                )
+            else:
+                self.report.report_failure(
+                    title="Failed to Retrieve Charts",
+                    message="Unable to retrieve charts from Mode.",
+                    context=f"Report Token: {report_token}, Query Token: {query_token}, Error: {str(e)}",
+                )
+            return []
 
     def _get_paged_request_json(
         self, url: str, key: str, per_page: int
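
One pattern repeats across these rewritten helpers: a 404 is downgraded from an ingestion failure to a warning, on the assumption that the space, report, or query was deleted between listing and fetching. The shared branch reduces to:

```python
from requests.exceptions import HTTPError


def is_probably_deleted(e: Exception) -> bool:
    # A 404 on a fetch that followed a successful listing usually means the
    # asset was deleted in between; the None guard here is extra safety not
    # present in the source, which checks e.response.status_code directly.
    return (
        isinstance(e, HTTPError)
        and e.response is not None
        and e.response.status_code == 404
    )
```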
@@ -1521,6 +1596,7 @@ class ModeSource(StatefulIngestionSourceBase):
             yield data
             page += 1
 
+    @lru_cache(maxsize=None)
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
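
The caching strategy moves with this hunk: instead of `@lru_cache` on each `_get_reports`/`_get_datasets`/`_get_queries`/`_get_charts` wrapper, the cache now sits on `_get_request_json` itself, keyed on the URL, so any repeated GET (for example, `emit_chart_mces` and `emit_dataset_mces` both fetching a report's queries) is deduplicated at a single choke point. A toy illustration, including the `cache_info()` counters that the new `run()` code later copies into the report:

```python
from functools import lru_cache


@lru_cache(maxsize=None)
def get_json(url: str) -> dict:  # stand-in for the retried _get_request_json
    print(f"GET {url}")
    return {"_embedded": {}}


get_json("https://app.mode.com/api/acme/reports/abc/queries")  # miss: hits API
get_json("https://app.mode.com/api/acme/reports/abc/queries")  # hit: cached
print(get_json.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=None, currsize=1)
```

The trade-off is that every distinct response body now stays in memory for the life of the source, which is presumably why the same release starts tracking `process_memory_used_mb`.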
@@ -1568,6 +1644,17 @@ class ModeSource(StatefulIngestionSourceBase):
 
         return get_request()
 
+    @staticmethod
+    def _get_process_memory():
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        return {
+            "rss": mem_info.rss / (1024 * 1024),
+            "vms": mem_info.vms / (1024 * 1024),
+            "shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
+            "data": getattr(mem_info, "data", 0) / (1024 * 1024),
+        }
+
     @staticmethod
     def create_embed_aspect_mcp(
         entity_urn: str, embed_url: str
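
For reference, `memory_info()` comes from psutil and reports sizes in bytes; the helper converts to MiB and uses `getattr` because the `shared` and `data` fields are not available on every platform. A quick check:

```python
import os

import psutil

mem_info = psutil.Process(os.getpid()).memory_info()
print(round(mem_info.rss / (1024 * 1024), 2))  # resident set size in MiB
print(getattr(mem_info, "shared", 0))          # 0 where the field is missing
```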
@@ -1603,115 +1690,116 @@ class ModeSource(StatefulIngestionSourceBase):
             yield from self.construct_space_container(space_token, space_name)
             space_container_key = self.gen_space_key(space_token)
 
-            reports = self._get_reports(space_token)
-            for report in reports:
-                logger.debug(
-                    f"Report: name: {report.get('name')} token: {report.get('token')}"
-                )
-                dashboard_tuple_from_report = self.construct_dashboard(
-                    space_token=space_token, report_info=report
-                )
-
-                if dashboard_tuple_from_report is None:
-                    continue
-                (
-                    dashboard_snapshot_from_report,
-                    browse_mcpw,
-                ) = dashboard_tuple_from_report
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    logger.debug(
+                        f"Report: name: {report.get('name')} token: {report.get('token')}"
+                    )
+                    dashboard_tuple_from_report = self.construct_dashboard(
+                        space_token=space_token, report_info=report
+                    )
 
-                mce = MetadataChangeEvent(
-                    proposedSnapshot=dashboard_snapshot_from_report
-                )
+                    if dashboard_tuple_from_report is None:
+                        continue
+                    (
+                        dashboard_snapshot_from_report,
+                        browse_mcpw,
+                    ) = dashboard_tuple_from_report
 
-                mcpw = MetadataChangeProposalWrapper(
-                    entityUrn=dashboard_snapshot_from_report.urn,
-                    aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
-                )
-                yield mcpw.as_workunit()
-                yield from add_dataset_to_container(
-                    container_key=space_container_key,
-                    dataset_urn=dashboard_snapshot_from_report.urn,
-                )
-                yield browse_mcpw.as_workunit()
+                    mce = MetadataChangeEvent(
+                        proposedSnapshot=dashboard_snapshot_from_report
+                    )
 
-                usage_statistics = DashboardUsageStatisticsClass(
-                    timestampMillis=round(datetime.now().timestamp() * 1000),
-                    viewsCount=report.get("view_count", 0),
-                )
+                    mcpw = MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
+                    )
+                    yield mcpw.as_workunit()
+                    yield from add_dataset_to_container(
+                        container_key=space_container_key,
+                        dataset_urn=dashboard_snapshot_from_report.urn,
+                    )
+                    yield browse_mcpw.as_workunit()
 
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dashboard_snapshot_from_report.urn,
-                    aspect=usage_statistics,
-                ).as_workunit()
+                    usage_statistics = DashboardUsageStatisticsClass(
+                        timestampMillis=round(datetime.now().timestamp() * 1000),
+                        viewsCount=report.get("view_count", 0),
+                    )
 
-                if self.config.ingest_embed_url is True:
-                    yield self.create_embed_aspect_mcp(
-                        entity_urn=dashboard_snapshot_from_report.urn,
-                        embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=usage_statistics,
                     ).as_workunit()
 
-                yield MetadataWorkUnit(id=dashboard_snapshot_from_report.urn, mce=mce)
+                    if self.config.ingest_embed_url is True:
+                        yield self.create_embed_aspect_mcp(
+                            entity_urn=dashboard_snapshot_from_report.urn,
+                            embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                        ).as_workunit()
+
+                    yield MetadataWorkUnit(
+                        id=dashboard_snapshot_from_report.urn, mce=mce
+                    )
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
         for space_token in self.space_tokens:
-            reports = self._get_reports(space_token)
-            for report in reports:
-                report_token = report.get("token", "")
-
-                queries = self._get_queries(report_token)
-                for query in queries:
-                    query_mcps = self.construct_query_or_dataset(
-                        report_token,
-                        query,
-                        space_token=space_token,
-                        report_info=report,
-                        is_mode_dataset=False,
-                    )
-                    chart_fields: Dict[str, SchemaFieldClass] = {}
-                    for wu in query_mcps:
-                        if isinstance(
-                            wu.metadata, MetadataChangeProposalWrapper
-                        ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
-                            schema_metadata = wu.metadata.aspect
-                            for field in schema_metadata.fields:
-                                chart_fields.setdefault(field.fieldPath, field)
-
-                        yield wu
-
-                    charts = self._get_charts(report_token, query.get("token", ""))
-                    # build charts
-                    for i, chart in enumerate(charts):
-                        yield from self.construct_chart_from_api_data(
-                            i,
-                            chart,
-                            chart_fields,
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    report_token = report.get("token", "")
+
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
                             query,
                             space_token=space_token,
                             report_info=report,
-                            query_name=query["name"],
+                            is_mode_dataset=False,
                         )
+                        chart_fields: Dict[str, SchemaFieldClass] = {}
+                        for wu in query_mcps:
+                            if isinstance(
+                                wu.metadata, MetadataChangeProposalWrapper
+                            ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
+                                schema_metadata = wu.metadata.aspect
+                                for field in schema_metadata.fields:
+                                    chart_fields.setdefault(field.fieldPath, field)
+
+                            yield wu
+
+                        charts = self._get_charts(report_token, query.get("token", ""))
+                        # build charts
+                        for i, chart in enumerate(charts):
+                            yield from self.construct_chart_from_api_data(
+                                i,
+                                chart,
+                                chart_fields,
+                                query,
+                                space_token=space_token,
+                                report_info=report,
+                                query_name=query["name"],
+                            )
 
     def emit_dataset_mces(self):
         """
         Emits MetadataChangeEvents (MCEs) for datasets within each space.
         """
         for space_token, _ in self.space_tokens.items():
-            datasets = self._get_datasets(space_token)
-
-            for report in datasets:
-                report_token = report.get("token", "")
-                queries = self._get_queries(report_token)
-                for query in queries:
-                    query_mcps = self.construct_query_or_dataset(
-                        report_token,
-                        query,
-                        space_token=space_token,
-                        report_info=report,
-                        is_mode_dataset=True,
-                    )
-                    for wu in query_mcps:
-                        yield wu
+            for dataset_page in self._get_datasets(space_token):
+                for report in dataset_page:
+                    report_token = report.get("token", "")
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
+                            query,
+                            space_token=space_token,
+                            report_info=report,
+                            is_mode_dataset=True,
+                        )
+                        for wu in query_mcps:
+                            yield wu
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":
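
The net effect of these `emit_*` rewrites is mechanical: callers that used to iterate one fully materialized (and cached) list now stream page-sized batches, bounding peak memory to one page per request. Reduced to its essentials, the consumption shape is:

```python
from typing import Iterator, List


def fake_report_pages() -> Iterator[List[dict]]:
    # Stand-in for self._get_reports(space_token): pages arrive lazily.
    yield [{"token": "t1"}, {"token": "t2"}]
    yield [{"token": "t3"}]


for report_page in fake_report_pages():  # one page in memory at a time
    for report in report_page:
        print(report.get("token", ""))
```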
@@ -1730,6 +1818,12 @@ class ModeSource(StatefulIngestionSourceBase):
         yield from self.emit_dashboard_mces()
         yield from self.emit_dataset_mces()
         yield from self.emit_chart_mces()
+        cache_info = self._get_request_json.cache_info()
+        self.report.get_cache_hits = cache_info.hits
+        self.report.get_cache_misses = cache_info.misses
+        self.report.get_cache_size = cache_info.currsize
+        memory_used = self._get_process_memory()
+        self.report.process_memory_used_mb = round(memory_used["rss"], 2)
 
     def get_report(self) -> SourceReport:
         return self.report
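
The remaining hunks apply to a different module in the same wheel: the SQL Server (mssql) ingestion source rather than the Mode source, as the hunk ranges restarting at line 13 and the `sqlalchemy.dialects.mssql` references indicate.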
@@ -13,6 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError
 
 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -60,6 +61,15 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
 register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
 register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)
 
+# Patterns copied from Snowflake source
+DEFAULT_TEMP_TABLES_PATTERNS = [
+    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
+    r".*__DBT_TMP$",  # dbt
+    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
+    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
+    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
+]
+
 
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
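
These defaults are exercised by the new `is_temp_table` override at the bottom of this diff, which runs `re.match` with `re.IGNORECASE` against the fully qualified `database.schema.table` name. A quick illustration (`looks_temporary` is a hypothetical free function, not part of the source):

```python
import re

from datahub.configuration.pattern_utils import UUID_REGEX

PATTERNS = [
    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
    r".*__DBT_TMP$",  # dbt
    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
]


def looks_temporary(name: str) -> bool:
    # Mirrors is_temp_table: match the whole qualified name, case-insensitively.
    return any(re.match(p, name, flags=re.IGNORECASE) for p in PATTERNS)


print(looks_temporary("analytics.dbo.orders__dbt_tmp"))  # True (dbt staging)
print(looks_temporary("analytics.dbo.orders"))           # False
```

The list is overridable through the `temporary_tables_pattern` config field added in the next hunk.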
@@ -114,6 +124,12 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
     )
+    temporary_tables_pattern: List[str] = Field(
+        default=DEFAULT_TEMP_TABLES_PATTERNS,
+        description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
+        "match the entire table name in database.schema.table format. Defaults are to set in such a way "
+        "to ignore the temporary staging tables created by known ETL tools.",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -179,6 +195,14 @@ class SQLServerSource(SQLAlchemySource):
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
+
+        self.report = SQLSourceReport()
+        if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
+            self.report.warning(
+                title="Potential issue with lineage",
+                message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure lineage correct, set 'convert_urns_to_lowercase' to True.",
+            )
+
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
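
The rationale for the warning: lineage edges are joined on exact urn strings, and SQL Server identifiers are case-insensitive while urns are not, so mixed-case urns may never line up with their lowercased counterparts. In short:

```python
# Urns are compared as exact strings, so case differences split one table
# into two apparent entities unless urns are lowercased consistently.
urn_mixed = "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDB.dbo.Orders,PROD)"
print(urn_mixed == urn_mixed.lower())  # False: lineage against one misses the other
```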
@@ -774,6 +798,13 @@ class SQLServerSource(SQLAlchemySource):
         )
 
     def is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            logger.debug(f"temp table matched by pattern {name}")
+            return True
+
         try:
             parts = name.split(".")
             table_name = parts[-1]