acryl-datahub 1.0.0.4rc7__py3-none-any.whl → 1.0.0.4rc8__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.

This version of acryl-datahub has been flagged as a potentially problematic release.

--- a/datahub/ingestion/source/mode.py
+++ b/datahub/ingestion/source/mode.py
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+import os
 import re
 import time
 from dataclasses import dataclass
@@ -9,6 +10,7 @@ from json import JSONDecodeError
 from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
 
 import dateutil.parser as dp
+import psutil
 import pydantic
 import requests
 import sqlglot
@@ -114,8 +116,12 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.perf_timer import PerfTimer
 
 logger: logging.Logger = logging.getLogger(__name__)
+# Default API limit for items returned per API call
+# Used for the default per_page value for paginated API requests
+DEFAULT_API_ITEMS_PER_PAGE = 30
 
 
 class SpaceKey(ContainerKey):
@@ -194,10 +200,25 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )
 
+    items_per_page: int = Field(
+        default=DEFAULT_API_ITEMS_PER_PAGE,
+        description="Number of items per page for paginated API requests.",
+        hidden_from_docs=True,
+    )
+
     @validator("connect_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
 
+    @validator("items_per_page")
+    def validate_items_per_page(cls, v):
+        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
+            return v
+        else:
+            raise ValueError(
+                f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
+            )
+
 
 class HTTPError429(HTTPError):
     pass
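
Taken together, the new field and its validator pin items_per_page to the closed interval [1, 30], with the API default of 30 doubling as the upper bound. A minimal standalone sketch of the same bounds check (names copied from the diff; the surrounding pydantic machinery is omitted):

    DEFAULT_API_ITEMS_PER_PAGE = 30

    def validate_items_per_page(v: int) -> int:
        # Accept only 1..30 inclusive, mirroring the validator above
        if 1 <= v <= DEFAULT_API_ITEMS_PER_PAGE:
            return v
        raise ValueError(
            f"items_per_page must be between 1 and {DEFAULT_API_ITEMS_PER_PAGE}"
        )

    validate_items_per_page(30)  # ok
    # validate_items_per_page(0) and validate_items_per_page(31) raise ValueError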
@@ -224,6 +245,20 @@ class ModeSourceReport(StaleEntityRemovalSourceReport):
     num_requests_exceeding_rate_limit: int = 0
     num_requests_retried_on_timeout: int = 0
     num_spaces_retrieved: int = 0
+    space_get_api_called: int = 0
+    report_get_api_called: int = 0
+    dataset_get_api_called: int = 0
+    query_get_api_called: int = 0
+    chart_get_api_called: int = 0
+    get_cache_hits: int = 0
+    get_cache_misses: int = 0
+    get_cache_size: int = 0
+    process_memory_used_mb: float = 0
+    space_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    report_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    dataset_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    query_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+    chart_get_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 
     def report_dropped_space(self, ent_name: str) -> None:
         self.filtered_spaces.append(ent_name)
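
The new *_get_timer fields are PerfTimer instances (from datahub.utilities.perf_timer) that the hunks below use as context managers around each group of Mode API calls, accumulating wall-clock time per endpoint. Roughly equivalent stopwatch behavior, as a standalone sketch (an approximation of the usage pattern, not PerfTimer's actual implementation):

    import time

    class StopwatchTimer:
        # Accumulates elapsed wall-clock time across repeated with-blocks.
        def __init__(self) -> None:
            self.elapsed_seconds = 0.0
            self._start = 0.0

        def __enter__(self) -> "StopwatchTimer":
            self._start = time.perf_counter()
            return self

        def __exit__(self, *exc) -> None:
            self.elapsed_seconds += time.perf_counter() - self._start

    timer = StopwatchTimer()
    for _ in range(3):
        with timer:
            time.sleep(0.01)  # stand-in for one batch of API calls
    # timer.elapsed_seconds now holds the total across all three blocks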
@@ -583,34 +618,38 @@ class ModeSource(StatefulIngestionSourceBase):
         space_info = {}
         try:
             logger.debug(f"Retrieving spaces for {self.workspace_uri}")
-            for spaces_page in self._get_paged_request_json(
-                f"{self.workspace_uri}/spaces?filter=all", "spaces", 30
-            ):
-                logger.debug(
-                    f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
-                )
-                self.report.num_spaces_retrieved += len(spaces_page)
-                for s in spaces_page:
-                    logger.debug(f"Space: {s.get('name')}")
-                    space_name = s.get("name", "")
-                    # Using both restricted and default_access_level because
-                    # there is a current bug with restricted returning False everytime
-                    # which has been reported to Mode team
-                    if self.config.exclude_restricted and (
-                        s.get("restricted")
-                        or s.get("default_access_level") == "restricted"
-                    ):
-                        logging.debug(
-                            f"Skipping space {space_name} due to exclude restricted"
-                        )
-                        continue
-                    if not self.config.space_pattern.allowed(space_name):
-                        self.report.report_dropped_space(space_name)
-                        logging.debug(
-                            f"Skipping space {space_name} due to space pattern"
-                        )
-                        continue
-                    space_info[s.get("token", "")] = s.get("name", "")
+            with self.report.space_get_timer:
+                for spaces_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces?filter=all",
+                    "spaces",
+                    self.config.items_per_page,
+                ):
+                    self.report.space_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(spaces_page)} spaces records from workspace {self.workspace_uri}"
+                    )
+                    self.report.num_spaces_retrieved += len(spaces_page)
+                    for s in spaces_page:
+                        logger.debug(f"Space: {s.get('name')}")
+                        space_name = s.get("name", "")
+                        # Using both restricted and default_access_level because
+                        # there is a current bug with restricted returning False everytime
+                        # which has been reported to Mode team
+                        if self.config.exclude_restricted and (
+                            s.get("restricted")
+                            or s.get("default_access_level") == "restricted"
+                        ):
+                            logging.debug(
+                                f"Skipping space {space_name} due to exclude restricted"
+                            )
+                            continue
+                        if not self.config.space_pattern.allowed(space_name):
+                            self.report.report_dropped_space(space_name)
+                            logging.debug(
+                                f"Skipping space {space_name} due to space pattern"
+                            )
+                            continue
+                        space_info[s.get("token", "")] = s.get("name", "")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
@@ -1414,48 +1453,61 @@ class ModeSource(StatefulIngestionSourceBase):
         mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
 
-    @lru_cache(maxsize=None)
-    def _get_reports(self, space_token: str) -> List[dict]:
-        reports = []
+    def _get_reports(self, space_token: str) -> Iterator[List[dict]]:
         try:
-            reports_json = self._get_request_json(
-                f"{self.workspace_uri}/spaces/{space_token}/reports"
-            )
-            reports = reports_json.get("_embedded", {}).get("reports", {})
+            with self.report.report_get_timer:
+                for reports_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/reports?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.report_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield reports_page
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
                 context=f"Space Token: {space_token}, Error: {str(e)}",
             )
-        return reports
 
-    @lru_cache(maxsize=None)
-    def _get_datasets(self, space_token: str) -> List[dict]:
+    def _get_datasets(self, space_token: str) -> Iterator[List[dict]]:
         """
         Retrieves datasets for a given space token.
         """
-        datasets = []
         try:
-            url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
-            datasets_json = self._get_request_json(url)
-            datasets = datasets_json.get("_embedded", {}).get("reports", [])
+            with self.report.dataset_get_timer:
+                for dataset_page in self._get_paged_request_json(
+                    f"{self.workspace_uri}/spaces/{space_token}/datasets?filter=all",
+                    "reports",
+                    self.config.items_per_page,
+                ):
+                    self.report.dataset_get_api_called += 1
+                    logger.debug(
+                        f"Read {len(dataset_page)} datasets records from workspace {self.workspace_uri} space {space_token}"
+                    )
+                    yield dataset_page
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
                 context=f"Error: {str(e)}",
             )
-        return datasets
 
-    @lru_cache(maxsize=None)
-    def _get_queries(self, report_token: str) -> list:
-        queries = []
+    def _get_queries(self, report_token: str) -> List[dict]:
         try:
-            queries_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}/queries"
-            )
-            queries = queries_json.get("_embedded", {}).get("queries", {})
+            with self.report.query_get_timer:
+                # This endpoint does not handle pagination properly
+                queries = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries"
+                )
+                self.report.query_get_api_called += 1
+                logger.debug(
+                    f"Read {len(queries)} queries records from workspace {self.workspace_uri} report {report_token}"
+                )
+                return queries.get("_embedded", {}).get("queries", [])
         except ModeRequestError as e:
             if isinstance(e, HTTPError) and e.response.status_code == 404:
                 self.report.report_warning(
@@ -1469,35 +1521,39 @@ class ModeSource(StatefulIngestionSourceBase):
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token: {report_token}, Error: {str(e)}",
             )
-        return queries
+        return []
 
     @lru_cache(maxsize=None)
-    def _get_last_query_run(
-        self, report_token: str, report_run_id: str, query_run_id: str
-    ) -> Dict:
+    def _get_last_query_run(self, report_token: str, report_run_id: str) -> list:
+        # This function is unused and may be subject to removal in a future revision of this source
+        query_runs = []
         try:
-            queries_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
-            )
-            queries = queries_json.get("_embedded", {}).get("queries", {})
+            for query_run_page in self._get_paged_request_json(
+                f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs?filter=all",
+                "query_runs",
+                self.config.items_per_page,
+            ):
+                query_runs.extend(query_run_page)
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
                 context=f"Report Token:{report_token}, Error: {str(e)}",
            )
-            return {}
-        return queries
+        return query_runs
 
-    @lru_cache(maxsize=None)
-    def _get_charts(self, report_token: str, query_token: str) -> list:
-        charts = []
+    def _get_charts(self, report_token: str, query_token: str) -> List[dict]:
         try:
-            charts_json = self._get_request_json(
-                f"{self.workspace_uri}/reports/{report_token}"
-                f"/queries/{query_token}/charts"
-            )
-            charts = charts_json.get("_embedded", {}).get("charts", {})
+            with self.report.chart_get_timer:
+                # This endpoint does not handle pagination properly
+                charts = self._get_request_json(
+                    f"{self.workspace_uri}/reports/{report_token}/queries/{query_token}/charts"
+                )
+                self.report.chart_get_api_called += 1
+                logger.debug(
+                    f"Read {len(charts)} charts records from workspace {self.workspace_uri} report {report_token} query {query_token}"
+                )
+                return charts.get("_embedded", {}).get("charts", [])
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
@@ -1506,7 +1562,7 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"Query token: {query_token}, "
                 f"Error: {str(e)}",
             )
-        return charts
+        return []
 
     def _get_paged_request_json(
         self, url: str, key: str, per_page: int
@@ -1521,6 +1577,7 @@ class ModeSource(StatefulIngestionSourceBase):
             yield data
             page += 1
 
+    @lru_cache(maxsize=20480)
     def _get_request_json(self, url: str) -> Dict:
         r = tenacity.Retrying(
             wait=wait_exponential(
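
Only the signature and tail of _get_paged_request_json appear in this diff. For context, a sketch of how such a pager plausibly walks Mode's _embedded envelope, under the assumption that pagination is driven by page/per_page query parameters (the exact URL handling in the released code is not shown here):

    from typing import Callable, Dict, Iterator, List

    def get_paged_json(
        get_json: Callable[[str], Dict], url: str, key: str, per_page: int
    ) -> Iterator[List[dict]]:
        # Yield _embedded[key] pages until the API returns an empty page.
        page = 1
        while True:
            sep = "&" if "?" in url else "?"
            data = (
                get_json(f"{url}{sep}page={page}&per_page={per_page}")
                .get("_embedded", {})
                .get(key, [])
            )
            if not data:
                return
            yield data
            page += 1

Because _get_request_json is now memoized with lru_cache(maxsize=20480), each distinct page URL is fetched over HTTP at most once per run; the repeated _get_reports calls in emit_dashboard_mces and emit_chart_mces below are then served from the cache rather than re-requested.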
@@ -1568,6 +1625,17 @@ class ModeSource(StatefulIngestionSourceBase):
 
         return get_request()
 
+    @staticmethod
+    def _get_process_memory():
+        process = psutil.Process(os.getpid())
+        mem_info = process.memory_info()
+        return {
+            "rss": mem_info.rss / (1024 * 1024),
+            "vms": mem_info.vms / (1024 * 1024),
+            "shared": getattr(mem_info, "shared", 0) / (1024 * 1024),
+            "data": getattr(mem_info, "data", 0) / (1024 * 1024),
+        }
+
     @staticmethod
     def create_embed_aspect_mcp(
         entity_urn: str, embed_url: str
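
psutil's memory_info() returns a platform-dependent named tuple: rss and vms are present on all platforms, while shared and data are Linux-specific, which is why the helper guards them with getattr(..., 0). A self-contained equivalent of the rss conversion, assuming psutil is installed:

    import os

    import psutil

    # Resident set size of the current process, converted to MiB
    mem_info = psutil.Process(os.getpid()).memory_info()
    print(f"rss={mem_info.rss / (1024 * 1024):.1f} MiB")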
@@ -1603,115 +1671,116 @@ class ModeSource(StatefulIngestionSourceBase):
             yield from self.construct_space_container(space_token, space_name)
             space_container_key = self.gen_space_key(space_token)
 
-            reports = self._get_reports(space_token)
-            for report in reports:
-                logger.debug(
-                    f"Report: name: {report.get('name')} token: {report.get('token')}"
-                )
-                dashboard_tuple_from_report = self.construct_dashboard(
-                    space_token=space_token, report_info=report
-                )
-
-                if dashboard_tuple_from_report is None:
-                    continue
-                (
-                    dashboard_snapshot_from_report,
-                    browse_mcpw,
-                ) = dashboard_tuple_from_report
-
-                mce = MetadataChangeEvent(
-                    proposedSnapshot=dashboard_snapshot_from_report
-                )
-
-                mcpw = MetadataChangeProposalWrapper(
-                    entityUrn=dashboard_snapshot_from_report.urn,
-                    aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
-                )
-                yield mcpw.as_workunit()
-                yield from add_dataset_to_container(
-                    container_key=space_container_key,
-                    dataset_urn=dashboard_snapshot_from_report.urn,
-                )
-                yield browse_mcpw.as_workunit()
-
-                usage_statistics = DashboardUsageStatisticsClass(
-                    timestampMillis=round(datetime.now().timestamp() * 1000),
-                    viewsCount=report.get("view_count", 0),
-                )
-
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dashboard_snapshot_from_report.urn,
-                    aspect=usage_statistics,
-                ).as_workunit()
-
-                if self.config.ingest_embed_url is True:
-                    yield self.create_embed_aspect_mcp(
-                        entity_urn=dashboard_snapshot_from_report.urn,
-                        embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
-                    ).as_workunit()
-
-                yield MetadataWorkUnit(id=dashboard_snapshot_from_report.urn, mce=mce)
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    logger.debug(
+                        f"Report: name: {report.get('name')} token: {report.get('token')}"
+                    )
+                    dashboard_tuple_from_report = self.construct_dashboard(
+                        space_token=space_token, report_info=report
+                    )
+
+                    if dashboard_tuple_from_report is None:
+                        continue
+                    (
+                        dashboard_snapshot_from_report,
+                        browse_mcpw,
+                    ) = dashboard_tuple_from_report
+
+                    mce = MetadataChangeEvent(
+                        proposedSnapshot=dashboard_snapshot_from_report
+                    )
+
+                    mcpw = MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=SubTypesClass(typeNames=[BIAssetSubTypes.MODE_REPORT]),
+                    )
+                    yield mcpw.as_workunit()
+                    yield from add_dataset_to_container(
+                        container_key=space_container_key,
+                        dataset_urn=dashboard_snapshot_from_report.urn,
+                    )
+                    yield browse_mcpw.as_workunit()
+
+                    usage_statistics = DashboardUsageStatisticsClass(
+                        timestampMillis=round(datetime.now().timestamp() * 1000),
+                        viewsCount=report.get("view_count", 0),
+                    )
+
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dashboard_snapshot_from_report.urn,
+                        aspect=usage_statistics,
+                    ).as_workunit()
+
+                    if self.config.ingest_embed_url is True:
+                        yield self.create_embed_aspect_mcp(
+                            entity_urn=dashboard_snapshot_from_report.urn,
+                            embed_url=f"{self.config.connect_uri}/{self.config.workspace}/reports/{report.get('token')}/embed",
+                        ).as_workunit()
+
+                    yield MetadataWorkUnit(
+                        id=dashboard_snapshot_from_report.urn, mce=mce
+                    )
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
         for space_token in self.space_tokens:
-            reports = self._get_reports(space_token)
-            for report in reports:
-                report_token = report.get("token", "")
-
-                queries = self._get_queries(report_token)
-                for query in queries:
-                    query_mcps = self.construct_query_or_dataset(
-                        report_token,
-                        query,
-                        space_token=space_token,
-                        report_info=report,
-                        is_mode_dataset=False,
-                    )
-                    chart_fields: Dict[str, SchemaFieldClass] = {}
-                    for wu in query_mcps:
-                        if isinstance(
-                            wu.metadata, MetadataChangeProposalWrapper
-                        ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
-                            schema_metadata = wu.metadata.aspect
-                            for field in schema_metadata.fields:
-                                chart_fields.setdefault(field.fieldPath, field)
-
-                        yield wu
-
-                    charts = self._get_charts(report_token, query.get("token", ""))
-                    # build charts
-                    for i, chart in enumerate(charts):
-                        yield from self.construct_chart_from_api_data(
-                            i,
-                            chart,
-                            chart_fields,
-                            query,
-                            space_token=space_token,
-                            report_info=report,
-                            query_name=query["name"],
-                        )
+            for report_page in self._get_reports(space_token):
+                for report in report_page:
+                    report_token = report.get("token", "")
+
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
+                            query,
+                            space_token=space_token,
+                            report_info=report,
+                            is_mode_dataset=False,
+                        )
+                        chart_fields: Dict[str, SchemaFieldClass] = {}
+                        for wu in query_mcps:
+                            if isinstance(
+                                wu.metadata, MetadataChangeProposalWrapper
+                            ) and isinstance(wu.metadata.aspect, SchemaMetadataClass):
+                                schema_metadata = wu.metadata.aspect
+                                for field in schema_metadata.fields:
+                                    chart_fields.setdefault(field.fieldPath, field)
+
+                            yield wu
+
+                        charts = self._get_charts(report_token, query.get("token", ""))
+                        # build charts
+                        for i, chart in enumerate(charts):
+                            yield from self.construct_chart_from_api_data(
+                                i,
+                                chart,
+                                chart_fields,
+                                query,
+                                space_token=space_token,
+                                report_info=report,
+                                query_name=query["name"],
+                            )
 
     def emit_dataset_mces(self):
         """
         Emits MetadataChangeEvents (MCEs) for datasets within each space.
         """
         for space_token, _ in self.space_tokens.items():
-            datasets = self._get_datasets(space_token)
-
-            for report in datasets:
-                report_token = report.get("token", "")
-                queries = self._get_queries(report_token)
-                for query in queries:
-                    query_mcps = self.construct_query_or_dataset(
-                        report_token,
-                        query,
-                        space_token=space_token,
-                        report_info=report,
-                        is_mode_dataset=True,
-                    )
-                    for wu in query_mcps:
-                        yield wu
+            for dataset_page in self._get_datasets(space_token):
+                for report in dataset_page:
+                    report_token = report.get("token", "")
+                    queries = self._get_queries(report_token)
+                    for query in queries:
+                        query_mcps = self.construct_query_or_dataset(
+                            report_token,
+                            query,
+                            space_token=space_token,
+                            report_info=report,
+                            is_mode_dataset=True,
+                        )
+                        for wu in query_mcps:
+                            yield wu
 
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ModeSource":
@@ -1730,6 +1799,12 @@ class ModeSource(StatefulIngestionSourceBase):
         yield from self.emit_dashboard_mces()
         yield from self.emit_dataset_mces()
         yield from self.emit_chart_mces()
+        cache_info = self._get_request_json.cache_info()
+        self.report.get_cache_hits = cache_info.hits
+        self.report.get_cache_misses = cache_info.misses
+        self.report.get_cache_size = cache_info.currsize
+        memory_used = self._get_process_memory()
+        self.report.process_memory_used_mb = round(memory_used["rss"], 2)
 
     def get_report(self) -> SourceReport:
         return self.report
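
functools.lru_cache exposes its statistics via cache_info(), which returns a CacheInfo(hits, misses, maxsize, currsize) named tuple; the close-out above copies three of those counters plus the process RSS into the ingestion report. A small illustration of the counters (toy function, not the Mode client):

    from functools import lru_cache

    @lru_cache(maxsize=20480)
    def fetch(url: str) -> dict:
        return {"url": url}

    fetch("a"); fetch("a"); fetch("b")
    info = fetch.cache_info()
    assert (info.hits, info.misses, info.currsize) == (1, 2, 2)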

--- a/datahub/ingestion/source/sql/mssql/source.py
+++ b/datahub/ingestion/source/sql/mssql/source.py
@@ -13,6 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError
 
 import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -60,6 +61,15 @@ register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClas
 register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
 register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)
 
+# Patterns copied from Snowflake source
+DEFAULT_TEMP_TABLES_PATTERNS = [
+    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
+    r".*__DBT_TMP$",  # dbt
+    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
+    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
+    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
+]
+
 
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
@@ -114,6 +124,12 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
     )
+    temporary_tables_pattern: List[str] = Field(
+        default=DEFAULT_TEMP_TABLES_PATTERNS,
+        description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to "
+        "match the entire table name in database.schema.table format. Defaults are to set in such a way "
+        "to ignore the temporary staging tables created by known ETL tools.",
+    )
 
     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
@@ -179,6 +195,14 @@ class SQLServerSource(SQLAlchemySource):
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
+
+        self.report = SQLSourceReport()
+        if self.config.include_lineage and not self.config.convert_urns_to_lowercase:
+            self.report.warning(
+                title="Potential issue with lineage",
+                message="Lineage may not resolve accurately because 'convert_urns_to_lowercase' is False. To ensure lineage correct, set 'convert_urns_to_lowercase' to True.",
+            )
+
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -774,6 +798,13 @@ class SQLServerSource(SQLAlchemySource):
         )
 
     def is_temp_table(self, name: str) -> bool:
+        if any(
+            re.match(pattern, name, flags=re.IGNORECASE)
+            for pattern in self.config.temporary_tables_pattern
+        ):
+            logger.debug(f"temp table matched by pattern {name}")
+            return True
+
         try:
             parts = name.split(".")
             table_name = parts[-1]
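
With this change, the pattern check runs before the existing name-based heuristics, so tables matching any configured regex are treated as temporary without further inspection. A quick standalone check of a subset of the defaults (the SEGMENT and STAGING patterns are omitted because they depend on datahub's UUID_REGEX constant; the table names are invented for illustration):

    import re

    PATTERNS = [
        r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
        r".*__DBT_TMP$",  # dbt
        r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
    ]

    def is_temp(name: str) -> bool:
        return any(re.match(p, name, flags=re.IGNORECASE) for p in PATTERNS)

    assert is_temp("db.schema.orders__dbt_tmp")
    assert is_temp("db.schema.GE_TMP_1A2B3C4D")
    assert not is_temp("db.schema.orders")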