castor-extractor 0.16.1__py3-none-any.whl → 0.16.4__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic.

Files changed (41)
  1. CHANGELOG.md +12 -0
  2. castor_extractor/commands/extract_databricks.py +3 -0
  3. castor_extractor/commands/extract_salesforce.py +43 -0
  4. castor_extractor/commands/extract_salesforce_reporting.py +6 -6
  5. castor_extractor/utils/client/api.py +36 -27
  6. castor_extractor/utils/salesforce/__init__.py +3 -0
  7. castor_extractor/utils/salesforce/client.py +84 -0
  8. castor_extractor/utils/salesforce/client_test.py +21 -0
  9. castor_extractor/utils/salesforce/constants.py +13 -0
  10. castor_extractor/utils/salesforce/credentials.py +65 -0
  11. castor_extractor/{visualization/salesforce_reporting/client → utils/salesforce}/credentials_test.py +3 -2
  12. castor_extractor/visualization/domo/client/client.py +1 -1
  13. castor_extractor/visualization/powerbi/client/constants.py +3 -3
  14. castor_extractor/visualization/powerbi/client/rest.py +14 -8
  15. castor_extractor/visualization/powerbi/client/rest_test.py +61 -27
  16. castor_extractor/visualization/salesforce_reporting/__init__.py +1 -2
  17. castor_extractor/visualization/salesforce_reporting/client/__init__.py +1 -2
  18. castor_extractor/visualization/salesforce_reporting/client/rest.py +7 -90
  19. castor_extractor/visualization/salesforce_reporting/extract.py +10 -8
  20. castor_extractor/visualization/tableau/assets.py +5 -0
  21. castor_extractor/visualization/tableau/client/client.py +10 -0
  22. castor_extractor/visualization/tableau/gql_fields.py +30 -9
  23. castor_extractor/warehouse/databricks/client.py +20 -3
  24. castor_extractor/warehouse/databricks/client_test.py +14 -0
  25. castor_extractor/warehouse/databricks/credentials.py +1 -4
  26. castor_extractor/warehouse/databricks/extract.py +3 -2
  27. castor_extractor/warehouse/databricks/format.py +5 -4
  28. castor_extractor/warehouse/salesforce/__init__.py +6 -0
  29. castor_extractor/warehouse/salesforce/client.py +112 -0
  30. castor_extractor/warehouse/salesforce/constants.py +2 -0
  31. castor_extractor/warehouse/salesforce/extract.py +111 -0
  32. castor_extractor/warehouse/salesforce/format.py +67 -0
  33. castor_extractor/warehouse/salesforce/format_test.py +32 -0
  34. castor_extractor/warehouse/salesforce/soql.py +45 -0
  35. castor_extractor-0.16.4.dist-info/LICENCE +86 -0
  36. {castor_extractor-0.16.1.dist-info → castor_extractor-0.16.4.dist-info}/METADATA +2 -3
  37. {castor_extractor-0.16.1.dist-info → castor_extractor-0.16.4.dist-info}/RECORD +39 -27
  38. {castor_extractor-0.16.1.dist-info → castor_extractor-0.16.4.dist-info}/WHEEL +1 -1
  39. {castor_extractor-0.16.1.dist-info → castor_extractor-0.16.4.dist-info}/entry_points.txt +2 -1
  40. castor_extractor/visualization/salesforce_reporting/client/constants.py +0 -2
  41. castor_extractor/visualization/salesforce_reporting/client/credentials.py +0 -33

castor_extractor/visualization/salesforce_reporting/client/__init__.py
@@ -1,2 +1 @@
-from .credentials import SalesforceCredentials
-from .rest import SalesforceClient
+from .rest import SalesforceReportingClient

castor_extractor/visualization/salesforce_reporting/client/rest.py
@@ -1,13 +1,8 @@
 import logging
-import os
-from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
-
-import requests
-from requests import Response
+from typing import Dict, Iterator, List, Optional

+from ....utils.salesforce import SalesforceBaseClient
 from ..assets import SalesforceReportingAsset
-from .constants import DEFAULT_API_VERSION, DEFAULT_PAGINATION_LIMIT
-from .credentials import SalesforceCredentials
 from .soql import queries

 logger = logging.getLogger(__name__)
@@ -19,89 +14,11 @@ REQUIRING_URL_ASSETS = (
 )


-class SalesforceClient:
+class SalesforceReportingClient(SalesforceBaseClient):
     """
-    Salesforce API client.
-    https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/intro_rest.htm
+    Salesforce Reporting API client
     """

-    api_version = DEFAULT_API_VERSION
-    pagination_limit = DEFAULT_PAGINATION_LIMIT
-
-    def __init__(
-        self,
-        credentials: SalesforceCredentials,
-        instance_url: str,
-    ):
-        self.credentials = credentials
-        self.instance_url = instance_url
-        self._token = self._access_token()
-
-    def _access_token(self) -> Tuple[str, str]:
-        url = f"{self.instance_url}/services/oauth2/token"
-        response = self._call(
-            url, "POST", data=self.credentials.token_request_payload()
-        )
-        return response["access_token"]
-
-    def _header(self) -> Dict:
-        return {"Authorization": f"Bearer {self._token}"}
-
-    @staticmethod
-    def _call(
-        url: str,
-        method: str = "GET",
-        *,
-        header: Optional[Dict] = None,
-        params: Optional[Dict] = None,
-        data: Optional[Dict] = None,
-        processor: Optional[Callable] = None,
-    ) -> Any:
-        logger.debug(f"Calling {method} on {url}")
-        result = requests.request(
-            method,
-            url,
-            headers=header,
-            params=params,
-            data=data,
-        )
-        result.raise_for_status()
-
-        if processor:
-            return processor(result)
-
-        return result.json()
-
-    @staticmethod
-    def _query_processor(response: Response) -> Tuple[dict, Optional[str]]:
-        results = response.json()
-        return results["records"], results.get("nextRecordsUrl")
-
-    def _query_all(self, query: str) -> Iterator[Dict]:
-        """
-        Run a SOQL query over salesforce API.
-
-        more: https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_query.htm
-        """
-        url = f"{self.instance_url}/services/data/v{self.api_version}/query"
-        records, next_page = self._call(
-            url,
-            params={"q": query},
-            processor=self._query_processor,
-            header=self._header(),
-        )
-        yield from records
-
-        page_count = 0
-        while next_page and page_count <= self.pagination_limit:
-            logger.info(f"querying page {page_count}")
-            url = f"{self.instance_url}{next_page}"
-            records, next_page = self._call(
-                url, processor=self._query_processor, header=self._header()
-            )
-            yield from records
-            page_count += 1
-
     def _get_asset_url(
         self, asset_type: SalesforceReportingAsset, asset: dict
     ) -> Optional[str]:
@@ -111,15 +28,15 @@ class SalesforceClient:

         if asset_type == SalesforceReportingAsset.DASHBOARDS:
             path = f"lightning/r/Dashboard/{asset['Id']}/view"
-            return os.path.join(self.instance_url, path)
+            return self.build_url(self._host, path)

         if asset_type == SalesforceReportingAsset.FOLDERS:
             path = asset["attributes"]["url"].lstrip("/")
-            return os.path.join(self.instance_url, path)
+            return self.build_url(self._host, path)

         if asset_type == SalesforceReportingAsset.REPORTS:
             path = f"lightning/r/Report/{asset['Id']}/view"
-            return os.path.join(self.instance_url, path)
+            return self.build_url(self._host, path)

         return None

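Note: the body of SalesforceBaseClient is not part of the hunks rendered here; it lives in the new castor_extractor/utils/salesforce/client.py (+84 lines, file 7 in the list above). From its call sites in this diff (build_url and _host here; _call, _query_processor, query_url, tooling_url and _has_reached_pagination_limit in the warehouse client further down), a rough sketch of its interface can be inferred. Only the names below are attested by the diff; every body is an assumption:

class SalesforceBaseClient:
    """Hypothetical sketch of the shared base client, inferred from usage."""

    def __init__(self, credentials):
        # Assumed: the host now comes from the credentials (base_url),
        # replacing the separate instance_url argument removed above;
        # token acquisition presumably mirrors the removed _access_token.
        self.credentials = credentials
        self._host = credentials.base_url

    @staticmethod
    def build_url(host: str, path: str) -> str:
        # Replaces the os.path.join(instance_url, path) calls removed above;
        # os.path.join is the wrong tool for URLs (platform-dependent separators).
        return f"{host.rstrip('/')}/{path.lstrip('/')}"

    @property
    def query_url(self) -> str:
        # Used by the warehouse SalesforceClient._query_all below;
        # the API version baked in here is unknown.
        return self.build_url(self._host, "services/data/vXX.0/query")

    @property
    def tooling_url(self) -> str:
        # Used by SalesforceClient.fetch_fields below.
        return self.build_url(self._host, "services/data/vXX.0/tooling/query")

    def _has_reached_pagination_limit(self, page_number: int) -> bool:
        raise NotImplementedError  # actual limit value unknown

    def _call(self, url, method="GET", *, params=None, data=None, processor=None):
        # Presumably the same shape as the _call removed from rest.py above.
        raise NotImplementedError

    @staticmethod
    def _query_processor(response):
        # Same contract as the removed helper: (records, nextRecordsUrl).
        results = response.json()
        return results["records"], results.get("nextRecordsUrl")
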
castor_extractor/visualization/salesforce_reporting/extract.py
@@ -10,14 +10,15 @@ from ...utils import (
     write_json,
     write_summary,
 )
+from ...utils.salesforce import SalesforceCredentials
 from .assets import SalesforceReportingAsset
-from .client import SalesforceClient, SalesforceCredentials
+from .client import SalesforceReportingClient

 logger = logging.getLogger(__name__)


 def iterate_all_data(
-    client: SalesforceClient,
+    client: SalesforceReportingClient,
 ) -> Iterable[Tuple[str, Union[list, dict]]]:
     """Iterate over the extracted data from Salesforce"""

@@ -30,10 +31,10 @@ def iterate_all_data(
 def extract_all(
     username: str,
     password: str,
-    consumer_key: str,
-    consumer_secret: str,
+    client_id: str,
+    client_secret: str,
     security_token: str,
-    instance_url: str,
+    base_url: str,
     output_directory: Optional[str] = None,
 ) -> None:
     """
@@ -44,11 +45,12 @@ def extract_all(
     creds = SalesforceCredentials(
         username=username,
         password=password,
-        consumer_key=consumer_key,
-        consumer_secret=consumer_secret,
+        client_id=client_id,
+        client_secret=client_secret,
         security_token=security_token,
+        base_url=base_url,
     )
-    client = SalesforceClient(credentials=creds, instance_url=instance_url)
+    client = SalesforceReportingClient(credentials=creds)
     ts = current_timestamp()

     for key, data in iterate_all_data(client):

castor_extractor/visualization/tableau/assets.py
@@ -12,10 +12,12 @@ class TableauAsset(ExternalAsset):
     CUSTOM_SQL_TABLE = "custom_sql_tables"
     CUSTOM_SQL_QUERY = "custom_sql_queries"
     DASHBOARD = "dashboards"
+    DASHBOARD_SHEET = "dashboards_sheets"
     DATASOURCE = "datasources"
     FIELD = "fields"
     PROJECT = "projects"
     PUBLISHED_DATASOURCE = "published_datasources"
+    SHEET = "sheets"
     USAGE = "views"
     USER = "users"
     WORKBOOK = "workbooks"
@@ -25,7 +27,9 @@ class TableauAsset(ExternalAsset):
     def optional(cls) -> Set["TableauAsset"]:
         return {
             TableauAsset.DASHBOARD,
+            TableauAsset.DASHBOARD_SHEET,
             TableauAsset.FIELD,
+            TableauAsset.SHEET,
             TableauAsset.PUBLISHED_DATASOURCE,
         }

@@ -42,4 +46,5 @@ class TableauGraphqlAsset(Enum):
     DASHBOARD = "dashboards"
     DATASOURCE = "datasources"
     GROUP_FIELD = "groupFields"
+    SHEETS = "sheets"
     WORKBOOK_TO_DATASOURCE = "workbooks"

castor_extractor/visualization/tableau/client/client.py
@@ -173,6 +173,13 @@ class ApiClient:
             TableauAsset.DASHBOARD,
         )

+    def _fetch_sheets(self) -> SerializedAsset:
+        """Fetches sheets"""
+
+        return self._fetch_paginated_objects(
+            TableauAsset.SHEET,
+        )
+
     def _fetch_paginated_objects(self, asset: TableauAsset) -> SerializedAsset:
         """Fetches paginated objects"""

@@ -203,6 +210,9 @@ class ApiClient:
         if asset == TableauAsset.PUBLISHED_DATASOURCE:
             assets = self._fetch_published_datasources()

+        if asset == TableauAsset.SHEET:
+            assets = self._fetch_sheets()
+
         if asset == TableauAsset.USAGE:
             assets = self._fetch_usages(self._safe_mode)


castor_extractor/visualization/tableau/gql_fields.py
@@ -111,15 +111,15 @@ class GQLQueryFields(Enum):
     """

     DASHBOARDS: str = """
-    id
-    name
-    path
-    tags {
-        name
-    }
-    workbook {
-        luid # to retrieve the parent
-    }
+        id
+        name
+        path
+        tags {
+            name
+        }
+        workbook {
+            luid # to retrieve the parent
+        }
     """

     DATASOURCE: str = """
@@ -160,6 +160,21 @@ class GQLQueryFields(Enum):
         role
     """

+    SHEET: str = """
+        containedInDashboards {
+            id
+        }
+        id
+        index
+        name
+        upstreamFields{
+            name
+        }
+        workbook {
+            luid
+        }
+    """
+
     WORKBOOK_TO_DATASOURCE: str = """
         luid
         id
@@ -219,6 +234,12 @@ QUERY_FIELDS: Dict[TableauAsset, QueryInfo] = {
             OBJECT_TYPE: TableauGraphqlAsset.GROUP_FIELD,
         },
     ],
+    TableauAsset.SHEET: [
+        {
+            FIELDS: GQLQueryFields.SHEET,
+            OBJECT_TYPE: TableauGraphqlAsset.SHEETS,
+        },
+    ],
     TableauAsset.WORKBOOK_TO_DATASOURCE: [
         {
             FIELDS: GQLQueryFields.WORKBOOK_TO_DATASOURCE,
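
These enum members are raw GraphQL field blocks that the extractor interpolates into queries against Tableau's Metadata API. The QUERY_FIELDS entry added above pairs GQLQueryFields.SHEET with the "sheets" object type, so the assembled query is presumably along these lines (the real query builder and any pagination arguments it adds are not visible in this diff):

# Hypothetical assembly; the extractor's actual query builder is not shown here.
object_type = TableauGraphqlAsset.SHEETS.value  # "sheets"
fields = GQLQueryFields.SHEET.value
query = f"query {{ {object_type} {{ {fields} }} }}"
# roughly: query { sheets { containedInDashboards { id } id index name
#                           upstreamFields { name } workbook { luid } } }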

castor_extractor/warehouse/databricks/client.py
@@ -31,7 +31,7 @@ class DatabricksClient(APIClient):
         db_allowed: Optional[Set[str]] = None,
         db_blocked: Optional[Set[str]] = None,
     ):
-        super().__init__(credentials)
+        super().__init__(host=credentials.host, token=credentials.token)
         self._db_allowed = db_allowed
         self._db_blocked = db_blocked
         self.formatter = DatabricksFormatter()
@@ -87,15 +87,32 @@
             content.get("tables", []), schema
         )

-    def tables_and_columns(self, schemas: List[dict]) -> TablesColumns:
+    @staticmethod
+    def _match_table_with_user(table: dict, user_id_by_email: dict) -> dict:
+        table_owner_email = table.get("owner_email")
+        if not table_owner_email:
+            return table
+        owner_external_id = user_id_by_email.get(table_owner_email)
+        if not owner_external_id:
+            return table
+        return {**table, "owner_external_id": owner_external_id}
+
+    def tables_and_columns(
+        self, schemas: List[dict], users: List[dict]
+    ) -> TablesColumns:
         """
         Get the databricks tables & columns leveraging the unity catalog API
         """
         tables: List[dict] = []
         columns: List[dict] = []
+        user_id_by_email = {user.get("email"): user.get("id") for user in users}
         for schema in schemas:
             t_to_add, c_to_add = self._tables_columns_of_schema(schema)
-            tables.extend(t_to_add)
+            t_with_owner = [
+                self._match_table_with_user(table, user_id_by_email)
+                for table in t_to_add
+            ]
+            tables.extend(t_with_owner)
             columns.extend(c_to_add)
         return tables, columns


castor_extractor/warehouse/databricks/client_test.py
@@ -64,3 +64,17 @@
     assert client._keep_catalog("staging")
     assert not client._keep_catalog("dev")
     assert not client._keep_catalog("something_unknown")
+
+
+def test_DatabricksClient__match_table_with_user():
+    client = MockDatabricksClient()
+    users_by_email = {"bob@castordoc.com": 3}
+
+    table = {"id": 1, "owner_email": "bob@castordoc.com"}
+    table_with_owner = client._match_table_with_user(table, users_by_email)
+
+    assert table_with_owner == {**table, "owner_external_id": 3}
+
+    table_without_owner = {"id": 1, "owner_email": None}
+    actual = client._match_table_with_user(table_without_owner, users_by_email)
+    assert actual == table_without_owner

castor_extractor/warehouse/databricks/credentials.py
@@ -25,7 +25,4 @@ def to_credentials(params: dict) -> DatabricksCredentials:
     """extract Databricks credentials"""
     host = params.get("host") or from_env(_HOST)
     token = params.get("token") or from_env(_TOKEN)
-    return DatabricksCredentials(
-        host=host,
-        token=token,
-    )
+    return DatabricksCredentials(host=host, token=token)

castor_extractor/warehouse/databricks/extract.py
@@ -43,7 +43,7 @@ class DatabricksExtractionProcessor:
         self._storage = storage
         self._skip_existing = skip_existing

-    def _should_not_reextract(self, asset_group) -> bool:
+    def _should_not_reextract(self, asset_group: WarehouseAssetGroup) -> bool:
         """helper function to determine whether we need to extract"""
         if not self._skip_existing:
             return False
@@ -82,7 +82,8 @@ class DatabricksExtractionProcessor:

         del databases

-        tables, columns = self._client.tables_and_columns(schemas)
+        users = self._client.users()
+        tables, columns = self._client.tables_and_columns(schemas, users)

         location = self._storage.put(WarehouseAsset.TABLE.value, tables)
         catalog_locations[WarehouseAsset.TABLE.value] = location

castor_extractor/warehouse/databricks/format.py
@@ -19,10 +19,11 @@ def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:

 def _table_payload(schema: dict, table: dict) -> dict:
     return {
+        "description": table.get("comment"),
         "id": table["table_id"],
+        "owner_email": table.get("owner"),
         "schema_id": f"{schema['id']}",
         "table_name": table["name"],
-        "description": table.get("comment"),
         "tags": [],
         "type": table.get("table_type"),
     }
@@ -30,12 +31,12 @@ def _table_payload(schema: dict, table: dict) -> dict:

 def _column_payload(table: dict, column: dict) -> dict:
     return {
-        "id": f"`{table['id']}`.`{column['name']}`",
         "column_name": column["name"],
-        "table_id": table["id"],
-        "description": column.get("comment"),
         "data_type": column["type_name"],
+        "description": column.get("comment"),
+        "id": f"`{table['id']}`.`{column['name']}`",
         "ordinal_position": column["position"],
+        "table_id": table["id"],
     }


castor_extractor/warehouse/salesforce/__init__.py (new file)
@@ -0,0 +1,6 @@
+from .client import SalesforceClient
+from .extract import (
+    SALESFORCE_ASSETS,
+    SalesforceExtractionProcessor,
+    extract_all,
+)

castor_extractor/warehouse/salesforce/client.py (new file)
@@ -0,0 +1,112 @@
+import logging
+from typing import Dict, Iterator, List
+
+from tqdm import tqdm  # type: ignore
+
+from ...utils.salesforce import SalesforceBaseClient, SalesforceCredentials
+from .format import SalesforceFormatter
+from .soql import SOBJECT_FIELDS_QUERY_TPL, SOBJECTS_QUERY_TPL
+
+logger = logging.getLogger(__name__)
+
+
+class SalesforceClient(SalesforceBaseClient):
+    """
+    Salesforce API client to extract sobjects
+    """
+
+    # Implicit (hard-coded in Salesforce) limitation when using SOQL of 2,000 rows
+    LIMIT_RECORDS_PER_PAGE = 2000
+
+    def __init__(self, credentials: SalesforceCredentials):
+        super().__init__(credentials)
+        self.formatter = SalesforceFormatter()
+
+    @staticmethod
+    def name() -> str:
+        return "Salesforce"
+
+    def _format_query(self, query_template: str, start_durable_id: str) -> str:
+        return query_template.format(
+            start_durable_id=start_durable_id,
+            limit=self.LIMIT_RECORDS_PER_PAGE,
+        )
+
+    def _next_records(
+        self, url: str, query_template: str, start_durable_id: str = "0000"
+    ) -> List[dict]:
+        query = self._format_query(
+            query_template, start_durable_id=start_durable_id
+        )
+        records, _ = self._call(
+            url, params={"q": query}, processor=self._query_processor
+        )
+        return records
+
+    def _is_last_page(self, records: List[dict]) -> bool:
+        return len(records) < self.LIMIT_RECORDS_PER_PAGE
+
+    def _should_query_next_page(
+        self, records: List[dict], page_number: int
+    ) -> bool:
+        return not (
+            self._is_last_page(records)
+            or self._has_reached_pagination_limit(page_number)
+        )
+
+    def _query_all(self, query_template: str) -> Iterator[dict]:
+        """
+        Run a SOQL query over salesforce API
+
+        Note, pagination is performed via a LIMIT in the SOQL query and requires
+        that ids are sorted. The SOQL query must support `limit` and
+        `start_durable_id` as parameters.
+        """
+        url = self.query_url
+        logger.info("querying page 0")
+        records = self._next_records(url, query_template)
+        yield from records
+
+        page_count = 1
+        while self._should_query_next_page(records, page_count):
+            logger.info(f"querying page {page_count}")
+            last_durable_id = records[-1]["DurableId"]
+            records = self._next_records(
+                url, query_template, start_durable_id=last_durable_id
+            )
+            yield from records
+            page_count += 1
+
+    def fetch_sobjects(self) -> List[dict]:
+        """Fetch all sobjects"""
+        logger.info("Extracting sobjects")
+        return list(self._query_all(SOBJECTS_QUERY_TPL))
+
+    def fetch_fields(self, sobject_name: str) -> List[dict]:
+        """Fetches fields of a given sobject"""
+        query = SOBJECT_FIELDS_QUERY_TPL.format(
+            entity_definition_id=sobject_name
+        )
+        response = self._call(self.tooling_url, params={"q": query})
+        return response["records"]
+
+    def tables(self) -> List[dict]:
+        """
+        Get Salesforce sobjects as tables
+        """
+        sobjects = self.fetch_sobjects()
+        logger.info(f"Extracted {len(sobjects)} sobjects")
+        return self.formatter.tables(sobjects)
+
+    def columns(
+        self, sobject_names: List[str], show_progress: bool = True
+    ) -> List[dict]:
+        """
+        Get salesforce sobject fields as columns
+        show_progress: optionally deactivate the tqdm progress bar
+        """
+        sobject_fields: Dict[str, List[dict]] = dict()
+        for sobject_name in tqdm(sobject_names, disable=not show_progress):
+            fields = self.fetch_fields(sobject_name)
+            sobject_fields[sobject_name] = fields
+        return self.formatter.columns(sobject_fields)
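
The SOQL templates used here (SOBJECTS_QUERY_TPL and SOBJECT_FIELDS_QUERY_TPL) live in the new soql.py (+45 lines, file 34 in the list above) and are not rendered in this diff. Given that _format_query fills in start_durable_id and limit, and that _query_all reads DurableId off the last record of each page, a compatible sobjects template would have roughly this keyset-pagination shape (the object and field names are assumptions, based on Salesforce's EntityDefinition metadata object):

# Hypothetical reconstruction of a template compatible with _query_all above.
SOBJECTS_QUERY_TPL = """
    SELECT DurableId, QualifiedApiName, Label
    FROM EntityDefinition
    WHERE DurableId > '{start_durable_id}'
    ORDER BY DurableId
    LIMIT {limit}
"""

Paging on a sorted unique id, rather than OFFSET or nextRecordsUrl, works around the 2,000-row cap referenced by the LIMIT_RECORDS_PER_PAGE comment: each page restarts the query just past the last DurableId already seen.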

castor_extractor/warehouse/salesforce/constants.py (new file)
@@ -0,0 +1,2 @@
+DATABASE_NAME = "salesforce"
+SCHEMA_NAME = "schema"
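
DATABASE_NAME and SCHEMA_NAME suggest that every sobject is filed under a single pseudo database and schema on the catalog side. The formatter that consumes them (format.py, +67 lines, file 32 in the list above) is not rendered in this diff; by analogy with the Databricks _table_payload shown earlier, its table payloads presumably look something like the following, where every key and field choice is an assumption:

# Hypothetical sketch, mirroring the Databricks payload shape shown above.
def _table_payload(sobject: dict) -> dict:
    return {
        "description": sobject.get("Label"),
        "id": sobject["DurableId"],
        "schema_id": f"{DATABASE_NAME}.{SCHEMA_NAME}",
        "table_name": sobject["QualifiedApiName"],
        "tags": [],
        "type": "TABLE",
    }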

castor_extractor/warehouse/salesforce/extract.py (new file)
@@ -0,0 +1,111 @@
+import logging
+from typing import Dict, List, Tuple
+
+from ...utils import AbstractStorage, LocalStorage, write_summary
+from ...utils.salesforce import to_credentials
+from ..abstract import (
+    SupportedAssets,
+    WarehouseAsset,
+    WarehouseAssetGroup,
+    common_args,
+)
+from .client import SalesforceClient
+
+logger = logging.getLogger(__name__)
+
+
+Paths = Dict[str, str]
+
+SALESFORCE_CATALOG_ASSETS: Tuple[WarehouseAsset, ...] = (
+    WarehouseAsset.TABLE,
+    WarehouseAsset.COLUMN,
+)
+
+SALESFORCE_ASSETS: SupportedAssets = {
+    WarehouseAssetGroup.CATALOG: SALESFORCE_CATALOG_ASSETS
+}
+
+
+class SalesforceExtractionProcessor:
+    """Salesforce API-based extraction management - warehouse part"""
+
+    def __init__(
+        self,
+        client: SalesforceClient,
+        storage: AbstractStorage,
+        skip_existing: bool = False,
+    ):
+        self._client = client
+        self._storage = storage
+        self._skip_existing = skip_existing
+
+    def _should_extract(self) -> bool:
+        """helper function to determine whether we need to extract"""
+        if not self._skip_existing:
+            return True
+
+        for asset in SALESFORCE_CATALOG_ASSETS:
+            if not self._storage.exists(asset.value):
+                return True
+
+        logger.info("Skipped, files for catalog already exist")
+        return False
+
+    def _existing_group_paths(self) -> Paths:
+        return {
+            a.value: self._storage.path(a.value)
+            for a in SALESFORCE_CATALOG_ASSETS
+        }
+
+    def extract_catalog(self, show_progress: bool = True) -> Paths:
+        """
+        Extract the following catalog assets: tables and columns
+        and return the locations of the extracted data
+        """
+        if not self._should_extract():
+            return self._existing_group_paths()
+
+        catalog_locations: Paths = dict()
+
+        tables = self._client.tables()
+        location = self._storage.put(WarehouseAsset.TABLE.value, tables)
+        catalog_locations[WarehouseAsset.TABLE.value] = location
+        logger.info(f"Extracted {len(tables)} tables to {location}")
+
+        table_names = [t["table_name"] for t in tables]
+        columns = self._client.columns(table_names, show_progress)
+        location = self._storage.put(WarehouseAsset.COLUMN.value, columns)
+        catalog_locations[WarehouseAsset.COLUMN.value] = location
+        logger.info(f"Extracted {len(columns)} columns to {location}")
+        return catalog_locations
+
+    def extract_role(self) -> Paths:
+        """extract no users and return the empty file location"""
+        users: List[dict] = []
+        location = self._storage.put(WarehouseAsset.USER.value, users)
+        logger.info(f"Extracted {len(users)} users to {location}")
+        return {WarehouseAsset.USER.value: location}
+
+
+def extract_all(**kwargs) -> None:
+    """
+    Extract all assets from Salesforce and store the results in CSV files
+    """
+    output_directory, skip_existing = common_args(kwargs)
+
+    client = SalesforceClient(credentials=to_credentials(kwargs))
+    storage = LocalStorage(directory=output_directory)
+    extractor = SalesforceExtractionProcessor(
+        client=client,
+        storage=storage,
+        skip_existing=skip_existing,
+    )
+
+    extractor.extract_catalog()
+    extractor.extract_role()
+
+    write_summary(
+        output_directory,
+        storage.stored_at_ts,
+        client_name=client.name(),
+    )
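
Together with the new command module (castor_extractor/commands/extract_salesforce.py, +43 lines) and the extra console script registered in entry_points.txt, this gives a warehouse-side Salesforce extraction that can presumably also be driven directly from Python. The keyword names below are inferred from the SalesforceCredentials fields visible earlier in this diff and from common_args; treat them as assumptions, not a confirmed signature:

from castor_extractor.warehouse.salesforce import extract_all

extract_all(
    username="alice@example.com",
    password="********",
    client_id="<connected-app consumer key>",
    client_secret="<connected-app consumer secret>",
    security_token="<salesforce security token>",
    base_url="https://mycompany.my.salesforce.com",
    output="/tmp/castor",  # directory for the CSV files; keyword name assumed
)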