acryl-datahub 0.15.0.2rc6__py3-none-any.whl → 0.15.0.2rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -174,6 +174,8 @@ from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.stats_collections import TopKDict
  from datahub.utilities.urns.dataset_urn import DatasetUrn

+ DEFAULT_PAGE_SIZE = 10
+
  try:
      # On earlier versions of the tableauserverclient, the NonXMLResponseError
      # was thrown when reauthentication was necessary. We'll keep both exceptions
@@ -342,11 +344,140 @@ class PermissionIngestionConfig(ConfigModel):
      )


+ class TableauPageSizeConfig(ConfigModel):
+     """
+     Configuration for setting page sizes for different Tableau metadata objects.
+
+     Some considerations:
+     - All have default values, so no setting is mandatory.
+     - In general, with the `effective_` methods, if not specifically set fine-grained metrics fallback to `page_size`
+       or correlate with `page_size`.
+
+     Measuring the impact of changing these values can be done by looking at the
+     `num_(filter_|paginated_)?queries_by_connection_type` metrics in the report.
+     """
+
+     page_size: int = Field(
+         default=DEFAULT_PAGE_SIZE,
+         description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
+     )
+
+     database_server_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of database servers to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_database_server_page_size(self) -> int:
+         return self.database_server_page_size or self.page_size
+
+     # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
+     # returns warnings like this:
+     # {
+     #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
+     #     'extensions': {
+     #         'severity': 'WARNING',
+     #         'code': 'NODE_LIMIT_EXCEEDED',
+     #         'properties': {
+     #             'nodeLimit': 20000
+     #         }
+     #     }
+     # }
+     # Reducing the page size for the workbook queries helps to avoid this.
+     workbook_page_size: Optional[int] = Field(
+         default=1,
+         description="[advanced] Number of workbooks to query at a time using the Tableau API; defaults to `1` and fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_workbook_page_size(self) -> int:
+         return self.workbook_page_size or self.page_size
+
+     sheet_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of sheets to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_sheet_page_size(self) -> int:
+         return self.sheet_page_size or self.page_size
+
+     dashboard_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of dashboards to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_dashboard_page_size(self) -> int:
+         return self.dashboard_page_size or self.page_size
+
+     embedded_datasource_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of embedded datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_embedded_datasource_page_size(self) -> int:
+         return self.embedded_datasource_page_size or self.page_size
+
+     # Since the field upstream query was separated from the embedded datasource queries into an independent query,
+     # the number of queries increased significantly and so the execution time.
+     # To increase the batching and so reduce the number of queries, we can increase the page size for that
+     # particular case.
+     #
+     # That's why unless specifically set, we will effectively use 10 times the page size as the default page size.
+     embedded_datasource_field_upstream_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of upstream fields to query at a time for embedded datasources using the Tableau API; fallbacks to `page_size` * 10 if not set.",
+     )
+
+     @property
+     def effective_embedded_datasource_field_upstream_page_size(self) -> int:
+         return self.embedded_datasource_field_upstream_page_size or self.page_size * 10
+
+     published_datasource_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of published datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_published_datasource_page_size(self) -> int:
+         return self.published_datasource_page_size or self.page_size
+
+     published_datasource_field_upstream_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of upstream fields to query at a time for published datasources using the Tableau API; fallbacks to `page_size` * 10 if not set.",
+     )
+
+     @property
+     def effective_published_datasource_field_upstream_page_size(self) -> int:
+         return self.published_datasource_field_upstream_page_size or self.page_size * 10
+
+     custom_sql_table_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of custom sql datasources to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_custom_sql_table_page_size(self) -> int:
+         return self.custom_sql_table_page_size or self.page_size
+
+     database_table_page_size: Optional[int] = Field(
+         default=None,
+         description="[advanced] Number of database tables to query at a time using the Tableau API; fallbacks to `page_size` if not set.",
+     )
+
+     @property
+     def effective_database_table_page_size(self) -> int:
+         return self.database_table_page_size or self.page_size
+
+
  class TableauConfig(
      DatasetLineageProviderConfigBase,
      StatefulIngestionConfigBase,
      DatasetSourceConfigMixin,
      TableauConnectionConfig,
+     TableauPageSizeConfig,
  ):
      projects: Optional[List[str]] = Field(
          default=["default"],
@@ -396,29 +527,6 @@ class TableauConfig(
          description="Ingest details for tables external to (not embedded in) tableau as entities.",
      )

-     page_size: int = Field(
-         default=10,
-         description="[advanced] Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using the Tableau API.",
-     )
-
-     # We've found that even with a small workbook page size (e.g. 10), the Tableau API often
-     # returns warnings like this:
-     # {
-     #     'message': 'Showing partial results. The request exceeded the 20000 node limit. Use pagination, additional filtering, or both in the query to adjust results.',
-     #     'extensions': {
-     #         'severity': 'WARNING',
-     #         'code': 'NODE_LIMIT_EXCEEDED',
-     #         'properties': {
-     #             'nodeLimit': 20000
-     #         }
-     #     }
-     # }
-     # Reducing the page size for the workbook queries helps to avoid this.
-     workbook_page_size: int = Field(
-         default=1,
-         description="[advanced] Number of workbooks to query at a time using the Tableau API.",
-     )
-
      env: str = Field(
          default=builder.DEFAULT_ENV,
          description="Environment to use in namespace when constructing URNs.",
@@ -700,6 +808,23 @@ class TableauSourceReport(
          default_factory=(lambda: defaultdict(int))
      )

+     # Counters for tracking the number of queries made to get_connection_objects method
+     # by connection type (static and short set of keys):
+     # - num_queries_by_connection_type: total number of queries
+     # - num_filter_queries_by_connection_type: number of paginated queries due to splitting query filters
+     # - num_paginated_queries_by_connection_type: total number of queries due to Tableau pagination
+     # These counters are useful to understand the impact of changing the page size.
+
+     num_queries_by_connection_type: Dict[str, int] = dataclass_field(
+         default_factory=(lambda: defaultdict(int))
+     )
+     num_filter_queries_by_connection_type: Dict[str, int] = dataclass_field(
+         default_factory=(lambda: defaultdict(int))
+     )
+     num_paginated_queries_by_connection_type: Dict[str, int] = dataclass_field(
+         default_factory=(lambda: defaultdict(int))
+     )
+

  def report_user_role(report: TableauSourceReport, server: Server) -> None:
      title: str = "Insufficient Permissions"
@@ -994,7 +1119,9 @@ class TableauSiteSource:
              return server_connection

          for database_server in self.get_connection_objects(
-             database_servers_graphql_query, c.DATABASE_SERVERS_CONNECTION
+             query=database_servers_graphql_query,
+             connection_type=c.DATABASE_SERVERS_CONNECTION,
+             page_size=self.config.effective_database_server_page_size,
          ):
              database_server_id = database_server.get(c.ID)
              server_connection = database_server.get(c.HOST_NAME)
@@ -1420,22 +1547,30 @@ class TableauSiteSource:
          self,
          query: str,
          connection_type: str,
+         page_size: int,
          query_filter: dict = {},
-         page_size_override: Optional[int] = None,
      ) -> Iterable[dict]:
          query_filter = optimize_query_filter(query_filter)

          # Calls the get_connection_object_page function to get the objects,
          # and automatically handles pagination.
-         page_size = page_size_override or self.config.page_size

          filter_pages = get_filter_pages(query_filter, page_size)
+         self.report.num_queries_by_connection_type[connection_type] += 1
+         self.report.num_filter_queries_by_connection_type[connection_type] += len(
+             filter_pages
+         )
+
          for filter_page in filter_pages:
              has_next_page = 1
              current_cursor: Optional[str] = None
              while has_next_page:
                  filter_: str = make_filter(filter_page)

+                 self.report.num_paginated_queries_by_connection_type[
+                     connection_type
+                 ] += 1
+
                  self.report.num_expected_tableau_metadata_queries += 1
                  (
                      connection_objects,
@@ -1463,10 +1598,10 @@ class TableauSiteSource:
          projects = {c.PROJECT_NAME_WITH_IN: project_names}

          for workbook in self.get_connection_objects(
-             workbook_graphql_query,
-             c.WORKBOOKS_CONNECTION,
-             projects,
-             page_size_override=self.config.workbook_page_size,
+             query=workbook_graphql_query,
+             connection_type=c.WORKBOOKS_CONNECTION,
+             query_filter=projects,
+             page_size=self.config.effective_workbook_page_size,
          ):
              # This check is needed as we are using projectNameWithin which return project as per project name so if
              # user want to ingest only nested project C from A->B->C then tableau might return more than one Project
@@ -1921,9 +2056,10 @@ class TableauSiteSource:

          custom_sql_connection = list(
              self.get_connection_objects(
-                 custom_sql_graphql_query,
-                 c.CUSTOM_SQL_TABLE_CONNECTION,
-                 custom_sql_filter,
+                 query=custom_sql_graphql_query,
+                 connection_type=c.CUSTOM_SQL_TABLE_CONNECTION,
+                 query_filter=custom_sql_filter,
+                 page_size=self.config.effective_custom_sql_table_page_size,
              )
          )

@@ -2632,6 +2768,7 @@ class TableauSiteSource:
          self,
          datasource: dict,
          field_upstream_query: str,
+         page_size: int,
      ) -> dict:
          # Collect field ids to fetch field upstreams
          field_ids: List[str] = []
@@ -2642,9 +2779,10 @@ class TableauSiteSource:
          # Fetch field upstreams and arrange them in map
          field_vs_upstream: Dict[str, dict] = {}
          for field_upstream in self.get_connection_objects(
-             field_upstream_query,
-             c.FIELDS_CONNECTION,
-             {c.ID_WITH_IN: field_ids},
+             query=field_upstream_query,
+             connection_type=c.FIELDS_CONNECTION,
+             query_filter={c.ID_WITH_IN: field_ids},
+             page_size=page_size,
          ):
              if field_upstream.get(c.ID):
                  field_id = field_upstream[c.ID]
@@ -2667,13 +2805,15 @@ class TableauSiteSource:
          datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}

          for datasource in self.get_connection_objects(
-             published_datasource_graphql_query,
-             c.PUBLISHED_DATA_SOURCES_CONNECTION,
-             datasource_filter,
+             query=published_datasource_graphql_query,
+             connection_type=c.PUBLISHED_DATA_SOURCES_CONNECTION,
+             query_filter=datasource_filter,
+             page_size=self.config.effective_published_datasource_page_size,
          ):
              datasource = self.update_datasource_for_field_upstream(
                  datasource=datasource,
                  field_upstream_query=datasource_upstream_fields_graphql_query,
+                 page_size=self.config.effective_published_datasource_field_upstream_page_size,
              )

              yield from self.emit_datasource(datasource)
@@ -2689,11 +2829,12 @@ class TableauSiteSource:
              c.ID_WITH_IN: list(tableau_database_table_id_to_urn_map.keys())
          }

-         # Emmitting tables that came from Tableau metadata
+         # Emitting tables that came from Tableau metadata
          for tableau_table in self.get_connection_objects(
-             database_tables_graphql_query,
-             c.DATABASE_TABLES_CONNECTION,
-             tables_filter,
+             query=database_tables_graphql_query,
+             connection_type=c.DATABASE_TABLES_CONNECTION,
+             query_filter=tables_filter,
+             page_size=self.config.effective_database_table_page_size,
          ):
              database_table = self.database_tables[
                  tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
@@ -2882,9 +3023,10 @@ class TableauSiteSource:
          sheets_filter = {c.ID_WITH_IN: self.sheet_ids}

          for sheet in self.get_connection_objects(
-             sheet_graphql_query,
-             c.SHEETS_CONNECTION,
-             sheets_filter,
+             query=sheet_graphql_query,
+             connection_type=c.SHEETS_CONNECTION,
+             query_filter=sheets_filter,
+             page_size=self.config.effective_sheet_page_size,
          ):
              if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
                  yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
@@ -3202,9 +3344,10 @@ class TableauSiteSource:
          dashboards_filter = {c.ID_WITH_IN: self.dashboard_ids}

          for dashboard in self.get_connection_objects(
-             dashboard_graphql_query,
-             c.DASHBOARDS_CONNECTION,
-             dashboards_filter,
+             query=dashboard_graphql_query,
+             connection_type=c.DASHBOARDS_CONNECTION,
+             query_filter=dashboards_filter,
+             page_size=self.config.effective_dashboard_page_size,
          ):
              if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
                  yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
@@ -3349,13 +3492,15 @@ class TableauSiteSource:
          datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}

          for datasource in self.get_connection_objects(
-             embedded_datasource_graphql_query,
-             c.EMBEDDED_DATA_SOURCES_CONNECTION,
-             datasource_filter,
+             query=embedded_datasource_graphql_query,
+             connection_type=c.EMBEDDED_DATA_SOURCES_CONNECTION,
+             query_filter=datasource_filter,
+             page_size=self.config.effective_embedded_datasource_page_size,
          ):
              datasource = self.update_datasource_for_field_upstream(
                  datasource=datasource,
                  field_upstream_query=datasource_upstream_fields_graphql_query,
+                 page_size=self.config.effective_embedded_datasource_field_upstream_page_size,
              )
              yield from self.emit_datasource(
                  datasource,
@@ -642,8 +642,11 @@ class TableauUpstreamReference:

      @classmethod
      def create(
-         cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None
+         cls, d: Dict, default_schema_map: Optional[Dict[str, str]] = None
      ) -> "TableauUpstreamReference":
+         if d is None:
+             raise ValueError("TableauUpstreamReference.create: d is None")
+
          # Values directly from `table` object from Tableau
          database_dict = (
              d.get(c.DATABASE) or {}
@@ -717,7 +720,7 @@ class TableauUpstreamReference:
          # schema

          # TODO: Validate the startswith check. Currently required for our integration tests
-         if full_name is None or not full_name.startswith("["):
+         if full_name is None:
              return None

          return full_name.replace("[", "").replace("]", "").split(".")
@@ -11,34 +11,25 @@ class DataHubSecretsClient:
      def __init__(self, graph: DataHubGraph):
          self.graph = graph

+     def _cleanup_secret_name(self, secret_names: List[str]) -> List[str]:
+         """Remove empty strings from the list of secret names."""
+         return [secret_name for secret_name in secret_names if secret_name]
+
      def get_secret_values(self, secret_names: List[str]) -> Dict[str, Optional[str]]:
          if len(secret_names) == 0:
              return {}

-         request_json = {
-             "query": """query getSecretValues($input: GetSecretValuesInput!) {\n
-                 getSecretValues(input: $input) {\n
-                     name\n
-                     value\n
-                 }\n
+         res_data = self.graph.execute_graphql(
+             query="""query getSecretValues($input: GetSecretValuesInput!) {
+                 getSecretValues(input: $input) {
+                     name
+                     value
+                 }
              }""",
-             "variables": {"input": {"secrets": secret_names}},
-         }
-         # TODO: Use graph.execute_graphql() instead.
-
-         # Fetch secrets using GraphQL API f
-         response = self.graph._session.post(
-             f"{self.graph.config.server}/api/graphql", json=request_json
+             variables={"input": {"secrets": self._cleanup_secret_name(secret_names)}},
          )
-         response.raise_for_status()
-
-         # Verify response
-         res_data = response.json()
-         if "errors" in res_data:
-             raise Exception("Failed to retrieve secrets from DataHub.")
-
          # Convert list of name, value secret pairs into a dict and return
-         secret_value_list = res_data["data"]["getSecretValues"]
+         secret_value_list = res_data["getSecretValues"]
          secret_value_dict = dict()
          for secret_value in secret_value_list:
              secret_value_dict[secret_value["name"]] = secret_value["value"]
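
A hedged usage sketch for the updated secrets client; the module path for `DataHubSecretsClient` and the server URL are assumptions, and the call now goes through `graph.execute_graphql()` with empty secret names dropped by `_cleanup_secret_name`:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    # Assumed module path for the client shown in the hunk above.
    from datahub.secret.datahub_secrets_client import DataHubSecretsClient

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
    secrets_client = DataHubSecretsClient(graph=graph)

    # "" is filtered out before the GraphQL request is issued.
    values = secrets_client.get_secret_values(["MY_SECRET", ""])
    print(values)  # e.g. {"MY_SECRET": "..."} for secrets defined on the server
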
@@ -284,6 +284,7 @@ class SqlAggregatorReport(Report):

      # Queries.
      num_queries_entities_generated: int = 0
+     num_queries_used_in_lineage: Optional[int] = None
      num_queries_skipped_due_to_filters: int = 0

      # Usage-related.
@@ -1200,6 +1201,7 @@ class SqlParsingAggregator(Closeable):
          queries_generated: Set[QueryId] = set()

          yield from self._gen_lineage_mcps(queries_generated)
+         self.report.num_queries_used_in_lineage = len(queries_generated)
          yield from self._gen_usage_statistics_mcps()
          yield from self._gen_operation_mcps(queries_generated)
          yield from self._gen_remaining_queries(queries_generated)
@@ -1,10 +1,9 @@
- from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
-
  import dataclasses
  import functools
  import logging
  import traceback
  from collections import defaultdict
+ from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
  from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union

  import pydantic.dataclasses
@@ -1,9 +1,8 @@
- from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
-
  import functools
  import hashlib
  import logging
  import re
+ from datahub.sql_parsing._sqlglot_patch import SQLGLOT_PATCHED
  from typing import Dict, Iterable, Optional, Tuple, Union

  import sqlglot
@@ -1,7 +1,7 @@
  from collections import deque
  from itertools import chain
  from sys import getsizeof
- from typing import Any, Callable
+ from typing import Any, Iterator


  def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
      Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
      """

-     dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
+     def dict_handler(d: dict) -> Iterator[Any]:
+         return chain.from_iterable(d.items())

      all_handlers = {
          tuple: iter,
@@ -1,7 +1,7 @@
  import functools
  import urllib.parse
  from abc import abstractmethod
- from typing import ClassVar, Dict, List, Optional, Type
+ from typing import ClassVar, Dict, List, Optional, Type, Union

  from deprecated import deprecated
  from typing_extensions import Self
@@ -86,12 +86,24 @@ class Urn:
          return self._entity_ids

      @classmethod
-     def from_string(cls, urn_str: str) -> Self:
-         """
-         Creates an Urn from its string representation.
+     def from_string(cls, urn_str: Union[str, "Urn"], /) -> Self:
+         """Create an Urn from its string representation.
+
+         When called against the base Urn class, this method will return a more specific Urn type where possible.
+
+         >>> from datahub.metadata.urns import DatasetUrn, Urn
+         >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+         >>> urn = Urn.from_string(urn_str)
+         >>> assert isinstance(urn, DatasetUrn)
+
+         When called against a specific Urn type (e.g. DatasetUrn.from_string), this method can
+         also be used for type narrowing.
+
+         >>> urn_str = 'urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)'
+         >>> assert DatasetUrn.from_string(urn_str)

          Args:
-             urn_str: The string representation of the Urn.
+             urn_str: The string representation of the urn. Also accepts an existing Urn instance.

          Returns:
              Urn of the given string representation.
@@ -100,6 +112,17 @@ class Urn:
              InvalidUrnError: If the string representation is in invalid format.
          """

+         if isinstance(urn_str, Urn):
+             if issubclass(cls, _SpecificUrn) and isinstance(urn_str, cls):
+                 # Fast path - we're already the right type.
+
+                 # I'm not really sure why we need a type ignore here, but mypy doesn't really
+                 # understand the isinstance check above.
+                 return urn_str  # type: ignore
+
+             # Fall through, so that we can convert a generic Urn to a specific Urn type.
+             urn_str = urn_str.urn()
+
          # TODO: Add handling for url encoded urns e.g. urn%3A ...

          if not urn_str.startswith("urn:li:"):
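
A small sketch exercising the behaviour added above, based on the doctest examples in the new docstring (not part of the diff):

    from datahub.metadata.urns import DatasetUrn, Urn

    urn = Urn.from_string(
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)"
    )
    assert isinstance(urn, DatasetUrn)  # the base-class call returns the specific subtype

    same = DatasetUrn.from_string(urn)  # an existing Urn instance is now accepted
    assert same is urn                  # the fast path returns the instance unchanged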