PyPI - omniload - Versions diffs - 0.0.0.dev0__py3-none-any.whl - Mend

omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (218) hide show

omniload/conftest.py +72 -0
omniload/main.py +810 -0
omniload/src/.gitignore +10 -0
omniload/src/adjust/__init__.py +108 -0
omniload/src/adjust/adjust_helpers.py +122 -0
omniload/src/airtable/__init__.py +84 -0
omniload/src/allium/__init__.py +128 -0
omniload/src/anthropic/__init__.py +277 -0
omniload/src/anthropic/helpers.py +525 -0
omniload/src/applovin/__init__.py +316 -0
omniload/src/applovin_max/__init__.py +117 -0
omniload/src/appsflyer/__init__.py +325 -0
omniload/src/appsflyer/client.py +110 -0
omniload/src/appstore/__init__.py +142 -0
omniload/src/appstore/client.py +126 -0
omniload/src/appstore/errors.py +15 -0
omniload/src/appstore/models.py +117 -0
omniload/src/appstore/resources.py +179 -0
omniload/src/arrow/__init__.py +81 -0
omniload/src/asana_source/__init__.py +281 -0
omniload/src/asana_source/helpers.py +30 -0
omniload/src/asana_source/settings.py +158 -0
omniload/src/attio/__init__.py +102 -0
omniload/src/attio/helpers.py +65 -0
omniload/src/blob.py +95 -0
omniload/src/bruin/__init__.py +76 -0
omniload/src/chess/__init__.py +180 -0
omniload/src/chess/helpers.py +35 -0
omniload/src/chess/settings.py +18 -0
omniload/src/clickup/__init__.py +85 -0
omniload/src/clickup/helpers.py +47 -0
omniload/src/collector/spinner.py +43 -0
omniload/src/couchbase_source/__init__.py +118 -0
omniload/src/couchbase_source/helpers.py +135 -0
omniload/src/cursor/__init__.py +83 -0
omniload/src/cursor/helpers.py +188 -0
omniload/src/customer_io/__init__.py +486 -0
omniload/src/customer_io/helpers.py +530 -0
omniload/src/destinations.py +982 -0
omniload/src/docebo/__init__.py +589 -0
omniload/src/docebo/client.py +435 -0
omniload/src/docebo/helpers.py +97 -0
omniload/src/dune/__init__.py +104 -0
omniload/src/dune/helpers.py +108 -0
omniload/src/dynamodb/__init__.py +86 -0
omniload/src/elasticsearch/__init__.py +80 -0
omniload/src/elasticsearch/helpers.py +141 -0
omniload/src/errors.py +26 -0
omniload/src/facebook_ads/__init__.py +403 -0
omniload/src/facebook_ads/exceptions.py +19 -0
omniload/src/facebook_ads/helpers.py +296 -0
omniload/src/facebook_ads/settings.py +224 -0
omniload/src/facebook_ads/utils.py +53 -0
omniload/src/factory.py +305 -0
omniload/src/filesystem/__init__.py +133 -0
omniload/src/filesystem/helpers.py +114 -0
omniload/src/filesystem/readers.py +187 -0
omniload/src/filters.py +62 -0
omniload/src/fireflies/__init__.py +151 -0
omniload/src/fireflies/helpers.py +753 -0
omniload/src/fluxx/__init__.py +10013 -0
omniload/src/fluxx/helpers.py +233 -0
omniload/src/frankfurter/__init__.py +157 -0
omniload/src/frankfurter/helpers.py +48 -0
omniload/src/freshdesk/__init__.py +103 -0
omniload/src/freshdesk/freshdesk_client.py +151 -0
omniload/src/freshdesk/settings.py +23 -0
omniload/src/fundraiseup/__init__.py +95 -0
omniload/src/fundraiseup/client.py +81 -0
omniload/src/github/__init__.py +202 -0
omniload/src/github/helpers.py +207 -0
omniload/src/github/queries.py +129 -0
omniload/src/github/settings.py +24 -0
omniload/src/google_ads/__init__.py +198 -0
omniload/src/google_ads/field.py +17 -0
omniload/src/google_ads/metrics.py +254 -0
omniload/src/google_ads/predicates.py +37 -0
omniload/src/google_ads/reports.py +411 -0
omniload/src/google_ads/test_google_ads.py +184 -0
omniload/src/google_analytics/__init__.py +144 -0
omniload/src/google_analytics/helpers.py +312 -0
omniload/src/google_sheets/README.md +95 -0
omniload/src/google_sheets/__init__.py +166 -0
omniload/src/google_sheets/helpers/__init__.py +15 -0
omniload/src/google_sheets/helpers/api_calls.py +160 -0
omniload/src/google_sheets/helpers/data_processing.py +316 -0
omniload/src/gorgias/__init__.py +595 -0
omniload/src/gorgias/helpers.py +166 -0
omniload/src/hostaway/__init__.py +302 -0
omniload/src/hostaway/client.py +288 -0
omniload/src/http/__init__.py +38 -0
omniload/src/http/readers.py +146 -0
omniload/src/http_client.py +24 -0
omniload/src/hubspot/__init__.py +800 -0
omniload/src/hubspot/helpers.py +417 -0
omniload/src/hubspot/settings.py +329 -0
omniload/src/indeed/__init__.py +153 -0
omniload/src/indeed/helpers.py +228 -0
omniload/src/influxdb/__init__.py +46 -0
omniload/src/influxdb/client.py +34 -0
omniload/src/intercom/__init__.py +142 -0
omniload/src/intercom/helpers.py +674 -0
omniload/src/intercom/settings.py +279 -0
omniload/src/isoc_pulse/__init__.py +159 -0
omniload/src/jira_source/__init__.py +377 -0
omniload/src/jira_source/helpers.py +510 -0
omniload/src/jira_source/settings.py +184 -0
omniload/src/kafka/__init__.py +120 -0
omniload/src/kafka/helpers.py +241 -0
omniload/src/kinesis/__init__.py +153 -0
omniload/src/kinesis/helpers.py +96 -0
omniload/src/klaviyo/__init__.py +237 -0
omniload/src/klaviyo/client.py +212 -0
omniload/src/klaviyo/helpers.py +19 -0
omniload/src/linear/__init__.py +634 -0
omniload/src/linear/helpers.py +111 -0
omniload/src/linkedin_ads/__init__.py +266 -0
omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
omniload/src/linkedin_ads/helpers.py +246 -0
omniload/src/loader.py +69 -0
omniload/src/mailchimp/__init__.py +126 -0
omniload/src/mailchimp/helpers.py +226 -0
omniload/src/mailchimp/settings.py +164 -0
omniload/src/masking.py +344 -0
omniload/src/mixpanel/__init__.py +62 -0
omniload/src/mixpanel/client.py +104 -0
omniload/src/monday/__init__.py +246 -0
omniload/src/monday/helpers.py +392 -0
omniload/src/monday/settings.py +325 -0
omniload/src/mongodb/__init__.py +281 -0
omniload/src/mongodb/helpers.py +975 -0
omniload/src/notion/__init__.py +69 -0
omniload/src/notion/helpers/__init__.py +14 -0
omniload/src/notion/helpers/client.py +178 -0
omniload/src/notion/helpers/database.py +92 -0
omniload/src/notion/settings.py +17 -0
omniload/src/partition.py +32 -0
omniload/src/personio/__init__.py +345 -0
omniload/src/personio/helpers.py +100 -0
omniload/src/phantombuster/__init__.py +65 -0
omniload/src/phantombuster/client.py +87 -0
omniload/src/pinterest/__init__.py +82 -0
omniload/src/pipedrive/__init__.py +212 -0
omniload/src/pipedrive/helpers/__init__.py +37 -0
omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
omniload/src/pipedrive/helpers/pages.py +129 -0
omniload/src/pipedrive/settings.py +41 -0
omniload/src/pipedrive/typing.py +17 -0
omniload/src/plusvibeai/__init__.py +335 -0
omniload/src/plusvibeai/helpers.py +544 -0
omniload/src/plusvibeai/settings.py +252 -0
omniload/src/primer/__init__.py +45 -0
omniload/src/primer/helpers.py +79 -0
omniload/src/quickbooks/__init__.py +117 -0
omniload/src/reddit_ads/__init__.py +183 -0
omniload/src/reddit_ads/helpers.py +232 -0
omniload/src/resource.py +40 -0
omniload/src/revenuecat/__init__.py +83 -0
omniload/src/revenuecat/helpers.py +237 -0
omniload/src/salesforce/__init__.py +170 -0
omniload/src/salesforce/helpers.py +78 -0
omniload/src/shopify/__init__.py +1953 -0
omniload/src/shopify/exceptions.py +17 -0
omniload/src/shopify/helpers.py +202 -0
omniload/src/shopify/settings.py +19 -0
omniload/src/slack/__init__.py +290 -0
omniload/src/slack/helpers.py +218 -0
omniload/src/slack/settings.py +36 -0
omniload/src/smartsheets/__init__.py +82 -0
omniload/src/snapchat_ads/__init__.py +455 -0
omniload/src/snapchat_ads/client.py +72 -0
omniload/src/snapchat_ads/helpers.py +630 -0
omniload/src/snapchat_ads/settings.py +130 -0
omniload/src/socrata_source/__init__.py +83 -0
omniload/src/socrata_source/helpers.py +85 -0
omniload/src/socrata_source/settings.py +8 -0
omniload/src/solidgate/__init__.py +219 -0
omniload/src/solidgate/helpers.py +154 -0
omniload/src/sources.py +5408 -0
omniload/src/sql_database/__init__.py +0 -0
omniload/src/sql_database/callbacks.py +66 -0
omniload/src/stripe_analytics/__init__.py +183 -0
omniload/src/stripe_analytics/helpers.py +386 -0
omniload/src/stripe_analytics/settings.py +80 -0
omniload/src/table_definition.py +15 -0
omniload/src/testdata/fakebqcredentials.json +14 -0
omniload/src/tiktok_ads/__init__.py +150 -0
omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
omniload/src/time.py +11 -0
omniload/src/trustpilot/__init__.py +48 -0
omniload/src/trustpilot/client.py +48 -0
omniload/src/version.py +6 -0
omniload/src/wise/__init__.py +68 -0
omniload/src/wise/client.py +63 -0
omniload/src/zendesk/__init__.py +480 -0
omniload/src/zendesk/helpers/__init__.py +39 -0
omniload/src/zendesk/helpers/api_helpers.py +119 -0
omniload/src/zendesk/helpers/credentials.py +68 -0
omniload/src/zendesk/helpers/talk_api.py +132 -0
omniload/src/zendesk/settings.py +71 -0
omniload/src/zoom/__init__.py +99 -0
omniload/src/zoom/helpers.py +102 -0
omniload/testdata/.gitignore +2 -0
omniload/testdata/create_replace.csv +21 -0
omniload/testdata/delete_insert_expected.csv +6 -0
omniload/testdata/delete_insert_part1.csv +5 -0
omniload/testdata/delete_insert_part2.csv +6 -0
omniload/testdata/merge_expected.csv +5 -0
omniload/testdata/merge_part1.csv +4 -0
omniload/testdata/merge_part2.csv +5 -0
omniload/tests/unit/test_smartsheets.py +133 -0
omniload-0.0.0.dev0.dist-info/METADATA +439 -0
omniload-0.0.0.dev0.dist-info/RECORD +218 -0
omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0

omniload/src/google_sheets/__init__.py ADDED Viewed

@@ -0,0 +1,166 @@
+# Copyright 2022-2025 ScaleVector
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Loads Google Sheets data from tabs, named and explicit ranges. Contains the main source functions."""
+from typing import Iterable, Sequence, Union
+import dlt
+from dlt.common import logger
+from dlt.sources import DltResource
+from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
+from .helpers import api_calls
+from .helpers.api_calls import api_auth
+from .helpers.data_processing import (
+    get_data_types,
+    get_range_headers,
+    get_spreadsheet_id,
+    process_range,
+)
+@dlt.source
+def google_spreadsheet(
+    spreadsheet_url_or_id: str = dlt.config.value,
+    range_names: Sequence[str] = dlt.config.value,
+    credentials: Union[
+        GcpOAuthCredentials, GcpServiceAccountCredentials
+    ] = dlt.secrets.value,
+    get_sheets: bool = False,
+    get_named_ranges: bool = True,
+    max_api_retries: int = 5,
+) -> Iterable[DltResource]:
+    """
+    The source for the dlt pipeline. It returns the following resources:
+    - 1 dlt resource for every non-empty range in range_names.
+    - If range_names is empty: optionally, dlt resources for all sheets inside the spreadsheet and all named ranges inside the spreadsheet.
+    - 1 `spreadsheet_info` resource (currently always empty, see the note in the body).
+    Args:
+        spreadsheet_url_or_id (str): The ID or URL of the spreadsheet.
+        range_names (Sequence[str]): A list of ranges in the spreadsheet in the format used by Google Sheets. Accepts Named Ranges and Sheets (tabs) names.
+            These are the ranges to be converted into tables.
+        credentials (Union[GcpServiceAccountCredentials, GcpOAuthCredentials]): GCP credentials to the account
+            with Google Sheets API access, defined in dlt.secrets.
+        get_sheets (bool, optional): If True and no range_names are given, load all the sheets inside the spreadsheet into the database.
+            Defaults to False.
+        get_named_ranges (bool, optional): If True and no range_names are given, load all the named ranges inside the spreadsheet into the database.
+            Defaults to True.
+        max_api_retries (int, optional): Max number of retries to google sheets API. Actual behavior is internal to google client.
+    Yields:
+        Iterable[DltResource]: List of dlt resources.
+    """
+    # authenticate to the service using the helper function
+    service = api_auth(credentials, max_api_retries=max_api_retries)
+    # get spreadsheet id from url or id
+    spreadsheet_id = get_spreadsheet_id(spreadsheet_url_or_id)
+    all_range_names = set(range_names or [])
+    # get metadata with list of sheets and named ranges in the spreadsheet
+    sheet_names, named_ranges, spreadsheet_title = api_calls.get_known_range_names(
+        spreadsheet_id=spreadsheet_id, service=service
+    )
+    # sheets and named ranges from metadata are used only if no explicit ranges were requested
+    if not range_names:
+        if get_sheets:
+            all_range_names.update(sheet_names)
+        if get_named_ranges:
+            all_range_names.update(named_ranges)
+    # first we get all data for all the ranges (explicit or named)
+    all_range_data = api_calls.get_data_for_ranges(
+        service=service,
+        spreadsheet_id=spreadsheet_id,
+        range_names=list(all_range_names),
+    )
+    assert len(all_range_names) == len(all_range_data), (
+        "Google Sheets API must return values for all requested ranges"
+    )
+    # get metadata for two first rows of each range
+    # first should contain headers
+    # second row contains data which we'll use to sample data types.
+    # google sheets return datetime and date types as lotus notes serial number. which is just a float so we cannot infer the correct types just from the data
+    # warn and remove empty ranges
+    range_data = []
+    # NOTE: nothing is appended to this table at the moment (the code below is disabled),
+    # so the `spreadsheet_info` resource yielded at the end carries no rows
+    metadata_table = []
+    for name, parsed_range, meta_range, values in all_range_data:
+        # # pass all ranges to spreadsheet info - including empty
+        # metadata_table.append(
+        #     {
+        #         "spreadsheet_id": spreadsheet_id,
+        #         "title": spreadsheet_title,
+        #         "range_name": name,
+        #         "range": str(parsed_range),
+        #         "range_parsed": parsed_range._asdict(),
+        #         "skipped": True,
+        #     }
+        # )
+        if values is None or len(values) == 0:
+            logger.warning(f"Range {name} does not contain any data. Skipping.")
+            continue
+        if len(values) == 1:
+            logger.warning(f"Range {name} contain only 1 row of data. Skipping.")
+            continue
+        if len(values[0]) == 0:
+            logger.warning(
+                f"First row of range {name} does not contain data. Skipping."
+            )
+            continue
+        # metadata_table[-1]["skipped"] = False
+        range_data.append((name, parsed_range, meta_range, values))
+    # Request cell metadata only when at least one range survived the filtering above:
+    # spreadsheets.get with an empty `ranges` list and includeGridData=True is not a no-op,
+    # it asks the API for the grid data of the whole spreadsheet.
+    if range_data:
+        meta_values = api_calls.get_meta_for_ranges(
+            service, spreadsheet_id, [str(data[2]) for data in range_data]
+        )
+    else:
+        meta_values = {"sheets": []}
+    for name, parsed_range, _, values in range_data:
+        logger.info(f"Processing range {parsed_range} with name {name}")
+        # here is a tricky part due to how Google Sheets API returns the metadata. We are not able to directly pair the input range names with returned metadata objects
+        # instead metadata objects are grouped by sheet names, still each group order preserves the order of input ranges
+        # so for each range we get a sheet name, we look for the metadata group for that sheet and then we consume first object on that list with pop
+        metadata = next(
+            sheet
+            for sheet in meta_values["sheets"]
+            if sheet["properties"]["title"] == parsed_range.sheet_name
+        )["data"].pop(0)
+        headers_metadata = metadata["rowData"][0]["values"]
+        headers = get_range_headers(headers_metadata, name)
+        if headers is None:
+            # generate automatic headers and treat the first row as data
+            headers = [f"col_{idx + 1}" for idx in range(len(headers_metadata))]
+            data_row_metadata = headers_metadata
+            rows_data = values[0:]
+            logger.warning(
+                f"Using automatic headers. WARNING: first row of the range {name} will be used as data!"
+            )
+        else:
+            # first row contains headers and is skipped
+            data_row_metadata = metadata["rowData"][1]["values"]
+            rows_data = values[1:]
+        data_types = get_data_types(data_row_metadata)
+        yield dlt.resource(
+            process_range(rows_data, headers=headers, data_types=data_types),
+            name=name,
+            write_disposition="replace",
+        )
+    yield dlt.resource(
+        metadata_table,
+        write_disposition="merge",
+        name="spreadsheet_info",
+        merge_key="spreadsheet_id",
+    )

omniload/src/google_sheets/helpers/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+# Copyright 2022-2025 ScaleVector
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Google Sheets source helpers"""

omniload/src/google_sheets/helpers/api_calls.py ADDED Viewed

@@ -0,0 +1,160 @@
+# Copyright 2022-2025 ScaleVector
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains helper functions to extract data from spreadsheet API"""
+from typing import Any, List, Tuple
+from dlt.common.exceptions import MissingDependencyException
+from dlt.common.typing import DictStrAny
+from dlt.sources.credentials import GcpCredentials, GcpOAuthCredentials
+from dlt.sources.helpers.requests.retry import DEFAULT_RETRY_STATUS
+from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
+from .data_processing import ParsedRange, trim_range_top_left
+try:
+    from apiclient.discovery import Resource, build
+except ImportError:
+    raise MissingDependencyException("Google API Client", ["google-api-python-client"])
+def is_retry_status_code(exception: BaseException) -> bool:
+    """Retry predicate: True only for a googleapiclient `HttpError` whose response status is in `DEFAULT_RETRY_STATUS`"""
+    # imported inside the function, so googleapiclient is looked up only when the predicate runs
+    from googleapiclient.errors import HttpError  # type: ignore
+    # any other exception type is never retried
+    if not isinstance(exception, HttpError):
+        return False
+    return exception.resp.status in DEFAULT_RETRY_STATUS
+retry_deco = retry(
+    # Retry only exceptions accepted by `is_retry_status_code`: HttpError responses whose
+    # status is listed in DEFAULT_RETRY_STATUS (so not just rate limiting / HTTP 429)
+    retry=retry_if_exception(is_retry_status_code),
+    # Use exponential backoff for the waiting time between retries, kept between 5 and 120 seconds
+    wait=wait_exponential(multiplier=1.5, min=5, max=120),
+    # Stop retrying after 10 attempts
+    stop=stop_after_attempt(10),
+    # When giving up, re-raise the last exception itself instead of tenacity's RetryError wrapper
+    reraise=True,
+)
+def api_auth(credentials: GcpCredentials, max_api_retries: int) -> Resource:
+    """
+    Builds a Google Sheets API (v4) client from GCP credentials.
+    Args:
+        credentials (GcpCredentials): Credentials needed to log in to GCP.
+        max_api_retries (int): Max number of retries to google sheets API, passed to the client as `num_retries`.
+            Actual behavior is internal to google client.
+    Returns:
+        Resource: Object needed to make API calls to Google Sheets API.
+    """
+    # OAuth credentials are authorized for the read-only spreadsheets scope first
+    if isinstance(credentials, GcpOAuthCredentials):
+        credentials.auth("https://www.googleapis.com/auth/spreadsheets.readonly")
+    native_credentials = credentials.to_native_credentials()
+    # the built service object is handed straight back to the caller
+    return build(
+        "sheets",
+        "v4",
+        credentials=native_credentials,
+        num_retries=max_api_retries,
+    )
+@retry_deco
+def get_meta_for_ranges(
+    service: Resource, spreadsheet_id: str, range_names: List[str]
+) -> Any:
+    """Fetches the spreadsheet `spreadsheet_id` restricted to `range_names`, with cell (grid) data included"""
+    request = service.spreadsheets().get(
+        spreadsheetId=spreadsheet_id,
+        ranges=range_names,
+        includeGridData=True,
+    )
+    return request.execute()
+@retry_deco
+def get_known_range_names(
+    spreadsheet_id: str, service: Resource
+) -> Tuple[List[str], List[str], str]:
+    """
+    Reads the spreadsheet metadata and pulls out the sheet titles, the named range names and the spreadsheet title
+    Args:
+        spreadsheet_id (str): The ID of the spreadsheet.
+        service (Resource): Resource object used to make API calls to Google Sheets API.
+    Returns:
+        Tuple[List[str], List[str], str] sheet names, named ranges, spreadsheet title
+    """
+    request = service.spreadsheets().get(spreadsheetId=spreadsheet_id)
+    spreadsheet = request.execute()
+    sheet_names: List[str] = [
+        sheet["properties"]["title"] for sheet in spreadsheet["sheets"]
+    ]
+    # "namedRanges" is read with a fallback, so a response without it yields an empty list
+    named_ranges: List[str] = [
+        named_range["name"] for named_range in spreadsheet.get("namedRanges", [])
+    ]
+    title: str = spreadsheet["properties"]["title"]
+    return sheet_names, named_ranges, title
+@retry_deco
+def get_data_for_ranges(
+    service: Resource, spreadsheet_id: str, range_names: List[str]
+) -> List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]:
+    """
+    Fetches the values of many ranges with a single batch call to Google Sheets API.
+    Args:
+        service (Resource): Object to make API calls to Google Sheets.
+        spreadsheet_id (str): The ID of the spreadsheet.
+        range_names (List[str]): List of range names.
+    Returns:
+        A list of tuples (name, parsed range, range covering the first two rows, values), paired with
+        `range_names` in order. `values` is None for ranges the API returned without data.
+    """
+    request = service.spreadsheets().values().batchGet(
+        spreadsheetId=spreadsheet_id,
+        ranges=range_names,
+        # un formatted returns typed values
+        valueRenderOption="UNFORMATTED_VALUE",
+        # will return formatted dates as a serial number
+        dateTimeRenderOption="SERIAL_NUMBER",
+    )
+    response = request.execute()
+    # a response without "valueRanges" (nothing to load) is treated as an empty batch
+    value_ranges: List[DictStrAny] = response.get("valueRanges", [])
+    results = []
+    for name, value_range in zip(range_names, value_ranges):
+        parsed_range = ParsedRange.parse_range(value_range["range"])
+        values: List[List[Any]] = value_range.get("values")
+        # drop the empty top rows and the empty columns on the left
+        if values:
+            parsed_range, values = trim_range_top_left(parsed_range, values)
+        # a second range spanning only the first two rows (header row and first data row)
+        meta_range = parsed_range._replace(end_row=parsed_range.start_row + 1)
+        results.append((name, parsed_range, meta_range, values))
+    return results

omniload/src/google_sheets/helpers/data_processing.py ADDED Viewed

@@ -0,0 +1,316 @@
+# Copyright 2022-2025 ScaleVector
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This is a helper module that contains function which validate and process data"""
+import re
+from typing import Any, Iterator, List, NamedTuple, Tuple, Union
+import dlt
+from dlt.common import logger, pendulum
+from dlt.common.data_types import TDataType
+from dlt.common.typing import DictStrAny
+# URL path segment that directly precedes the spreadsheet id (".../spreadsheets/d/<spreadsheet_id>/edit")
+URL_ID_IDENTIFIER = "d"
+# number of seconds in a day, used to turn serial day numbers into seconds
+SECONDS_PER_DAY = 86400
+# time zone of the datetimes produced from serial numbers
+DLT_TIMEZONE = "UTC"
+# number of seconds from UNIX timestamp origin (1st Jan 1970) to serial number origin (30th Dec 1899)
+TIMESTAMP_CONST = -2209161600.0
+# compiled regex that splits a range such as 'Sheet 1'!A1:C20 into an optional sheet name
+# (word characters, whitespace and single quotes only), start column/row and end column/row.
+# Column letters must be upper case and both corners must be fully specified.
+RE_PARSE_RANGE = re.compile(
+    r"^(?:(?P<sheet>[\'\w\s]+)!)?(?P<start_col>[A-Z]+)(?P<start_row>\d+):(?P<end_col>[A-Z]+)(?P<end_row>\d+)$"
+)
+class ParsedRange(NamedTuple):
+    """A cell range split into its sheet name and its top-left / bottom-right corners."""
+    # sheet title without the surrounding single quotes; empty if the range had no sheet prefix
+    sheet_name: str
+    # column letters and row number of the top-left cell
+    start_col: str
+    start_row: int
+    # column letters and row number of the bottom-right cell
+    end_col: str
+    end_row: int
+    @classmethod
+    def parse_range(cls, s: str) -> "ParsedRange":
+        """
+        Parses a range string like `'Sheet 1'!A1:C20` (the sheet prefix is optional).
+        Args:
+            s (str): The range string to parse.
+        Returns:
+            ParsedRange: The parsed range; `sheet_name` is "" when `s` has no sheet prefix.
+        Raises:
+            ValueError: If `s` does not match RE_PARSE_RANGE.
+        """
+        match = RE_PARSE_RANGE.match(s)
+        if match is None:
+            raise ValueError(s)
+        parsed_dict = match.groupdict()
+        # the sheet group is optional in the regex, so it is None for ranges like "A1:B2";
+        # calling .strip on it directly would raise AttributeError
+        sheet_name = (parsed_dict["sheet"] or "").strip("'")
+        return cls(
+            sheet_name,
+            parsed_dict["start_col"],
+            int(parsed_dict["start_row"]),
+            parsed_dict["end_col"],
+            int(parsed_dict["end_row"]),
+        )
+    def __str__(self) -> str:
+        """Formats the range back to `sheet!A1:B2`, or just `A1:B2` when there is no sheet name."""
+        cells = f"{self.start_col}{self.start_row}:{self.end_col}{self.end_row}"
+        if not self.sheet_name:
+            return cells
+        return f"{self.sheet_name}!{cells}"
+    @staticmethod
+    def shift_column(col: str, shift: int) -> str:
+        """
+        Shift a Google Sheets column string by a given number of positions.
+        Parameters:
+        col (str): The original column string.
+        shift (int): The number of positions to shift the column (may be negative).
+        Returns:
+        str: The new column string after shifting. It is an empty string when the shifted
+        column index is not positive.
+        """
+        # Convert column string to column index (1-indexed, "A" -> 1, "Z" -> 26, "AA" -> 27)
+        col_num = 0
+        for i, char in enumerate(reversed(col)):
+            col_num += (ord(char.upper()) - 65 + 1) * (26**i)
+        # Shift the column index
+        col_num += shift
+        # Convert back to column string; the "- 1" accounts for letters having no zero digit
+        col_str = ""
+        while col_num > 0:
+            col_num, remainder = divmod(col_num - 1, 26)
+            col_str = chr(65 + remainder) + col_str
+        return col_str
+def get_spreadsheet_id(url_or_id: str) -> str:
+    """
+    Resolves a spreadsheet reference, given either as an ID or as a URL, to the spreadsheet ID.
+    Args:
+        url_or_id (str): The ID or URL of the spreadsheet.
+    Returns:
+        str: The spreadsheet ID as a string.
+    """
+    # anything that does not start with an http(s) scheme is taken to be the id itself
+    if re.match(r"http://|https://", url_or_id) is None:
+        return url_or_id
+    # otherwise the id has to be cut out of the url
+    return extract_spreadsheet_id_from_url(url_or_id)
+def extract_spreadsheet_id_from_url(url: str) -> str:
+    """
+    Pulls the spreadsheet ID out of a Google spreadsheet URL of the form https://docs.google.com/spreadsheets/d/<spreadsheet_id>/edit.
+    The ID is the path segment that follows the first segment equal to URL_ID_IDENTIFIER.
+    Args:
+        url (str): The URL to the spreadsheet.
+    Returns:
+        str: The spreadsheet ID as a string.
+    Raises:
+        ValueError: If the URL has no identifier segment followed by another segment, or if that following segment is empty.
+    """
+    segments = url.split("/")
+    # walk over (segment, following segment) pairs; the last segment has no follower and is never a match
+    for segment, candidate_id in zip(segments, segments[1:]):
+        if segment != URL_ID_IDENTIFIER:
+            continue
+        # an empty id part means the url is not formatted correctly
+        if candidate_id == "":
+            raise ValueError(f"Spreadsheet ID is an empty string in url: {url}")
+        return candidate_id
+    raise ValueError(f"Invalid URL. Cannot find spreadsheet ID in url: {url}")
+def get_range_headers(
+    headers_metadata: List[DictStrAny], range_name: str
+) -> Union[List[str], None]:
+    """
+    Retrieves the headers for columns from the metadata of a range.
+    Args:
+        headers_metadata (List[DictStrAny]): Cell metadata of the first row of a range.
+        range_name (str): The name of the range, used in log messages.
+    Returns:
+        Union[List[str], None]: A list of headers, or None when the row cannot be used as headers:
+        a cell has no metadata, a cell holds a non-string value, or the headers are not unique
+        after dlt normalizes them. A warning is logged in each of those cases.
+    """
+    headers = []
+    for idx, header in enumerate(headers_metadata):
+        if not header:
+            logger.warning(
+                f"In range {range_name}, header at position {idx + 1} is missing!"
+            )
+            return None
+        if "stringValue" in header.get("effectiveValue", {}):
+            header_val = header["formattedValue"]
+        else:
+            header_val = header.get("formattedValue", None)
+            # if there's no formatted value then the cell is empty (no empty string as well!) in that case add auto name and move on
+            if header_val is None:
+                header_val = f"col_{idx + 1}"
+            else:
+                logger.warning(
+                    f"In range {range_name}, header value: {header_val} at position {idx + 1} is not a string!"
+                )
+                return None
+        headers.append(header_val)
+    # make sure that headers are unique, first normalize the headers
+    header_mappings = {
+        h: dlt.current.source_schema().naming.normalize_identifier(h) for h in headers
+    }
+    if len(set(header_mappings.values())) != len(headers):
+        logger.warning(
+            "Header names must be unique otherwise you risk that data in columns with duplicate header names to be lost. Note that several destinations require "
+            + "that column names are normalized ie. must be lower or upper case and without special characters. dlt normalizes those names for you but it may "
+            + f"result in duplicate column names. Headers in range {range_name} are mapped as follows: "
+            + ", ".join([f"{k}->{v}" for k, v in header_mappings.items()])
+            + ". Please make your header names unique."
+        )
+        return None
+    return headers
+def get_data_types(data_row_metadata: List[DictStrAny]) -> List[TDataType]:
+    """
+    Determines for each cell of a row whether its number format marks it as a date or a date/time.
+    Args:
+        data_row_metadata (List[DictStrAny]): Metadata of the first row of data
+    Returns:
+        List[TDataType]: one entry per cell: "timestamp" for DATE_TIME and TIME formats, "date" for DATE, otherwise None.
+        An empty list is returned for an empty row.
+    """
+    data_types: List[TDataType] = [None] * len(data_row_metadata)
+    for idx, val_dict in enumerate(data_row_metadata):
+        try:
+            number_format = val_dict["effectiveFormat"]["numberFormat"]["type"]
+        except KeyError:
+            # the cell carries no number format: its type stays None
+            continue
+        if number_format in ("DATE_TIME", "TIME"):
+            data_types[idx] = "timestamp"
+        elif number_format == "DATE":
+            data_types[idx] = "date"
+    return data_types
+def serial_date_to_datetime(
+    serial_number: Union[int, float], data_type: TDataType
+) -> Union[pendulum.DateTime, pendulum.Date]:
+    """
+    Converts a serial number (days counted from the serial number origin) to a date or a datetime.
+    Args:
+        serial_number (Union[int, float]): The Lotus Notes serial number
+        data_type (TDataType): "date" to get a date back, any other value to get a datetime
+    Returns:
+        Union[pendulum.DateTime, pendulum.Date]: The converted value, built in the DLT_TIMEZONE time zone
+    """
+    # seconds elapsed since the serial number origin, rounded to whole seconds and moved onto the UNIX time line
+    unix_seconds = TIMESTAMP_CONST + round(SECONDS_PER_DAY * serial_number)
+    unix_origin: pendulum.DateTime = pendulum.from_timestamp(0, DLT_TIMEZONE)
+    conv_datetime: pendulum.DateTime = unix_origin + pendulum.duration(
+        seconds=unix_seconds
+    )
+    # the requested data type, not the Python type of the number, decides what is returned
+    if data_type != "date":
+        return conv_datetime
+    return conv_datetime.date()  # type: ignore[no-any-return]
+def process_range(
+    sheet_values: List[List[Any]], headers: List[str], data_types: List[TDataType]
+) -> Iterator[DictStrAny]:
+    """
+    Turns rows of cell values into {header: value} dictionaries, converting date times on the way. Please note:
+    1. empty rows get ignored
+    2. empty cells (empty strings) are converted to None (and then to NULL by dlt)
+    3. cells, headers and data types are paired positionally and pairing stops at the shortest of the three,
+       so data in columns without headers will be dropped
+    Args:
+        sheet_values (List[List[Any]]): range values without the header row
+        headers (List[str]): names of the headers
+        data_types: List[TDataType]: "timestamp" and "date" or None for each column
+    Yields:
+        DictStrAny: one dictionary of the type {header: value} for every non-empty row.
+    """
+    for row in sheet_values:
+        # empty row; skip
+        if not row:
+            continue
+        record = {}
+        for cell, column_name, column_type in zip(row, headers, data_types):
+            # Null cell values are returned as empty strings; map them to None so dlt does not store empty strings
+            if cell == "":
+                record[column_name] = None
+                continue
+            # only real numbers in date/time columns are serial numbers; bool is excluded although it is an int.
+            # the column type comes from the first data row, so any other value is passed to dlt untouched
+            if (
+                column_type in ["timestamp", "date"]
+                and isinstance(cell, (int, float))
+                and not isinstance(cell, bool)
+            ):
+                record[column_name] = serial_date_to_datetime(cell, column_type)
+            else:
+                record[column_name] = cell
+        yield record
+def trim_range_top_left(
+    parsed_range: ParsedRange, range_values: List[List[Any]]
+) -> Tuple[ParsedRange, List[List[Any]]]:
+    """
+    Removes the empty rows at the top and the empty columns on the left of a range.
+    The number of columns to drop is taken from the leading empty-string cells of the first
+    non-empty row and is applied to every row. The input list is left untouched.
+    Args:
+        parsed_range (ParsedRange): The range the values were read from.
+        range_values (List[List[Any]]): Rows of cell values of that range.
+    Returns:
+        Tuple[ParsedRange, List[List[Any]]]: The range with its start row / start column moved
+        by the number of dropped rows / columns, and a new list with the trimmed rows.
+    """
+    # skip empty rows: count them up to the first row that holds any cell
+    row_shift = 0
+    for row in range_values:
+        if row:
+            break
+        row_shift += 1
+    trimmed_values = range_values[row_shift:]
+    # skip empty columns: count the leading empty cells of the first remaining row
+    col_shift = 0
+    if trimmed_values:
+        for cell in trimmed_values[0]:
+            if cell != "":
+                break
+            col_shift += 1
+        if col_shift > 0:
+            # build new rows instead of assigning into the list, so the caller's data is never modified
+            trimmed_values = [row[col_shift:] for row in trimmed_values]
+    parsed_range = parsed_range._replace(
+        start_row=parsed_range.start_row + row_shift,
+        start_col=ParsedRange.shift_column(parsed_range.start_col, col_shift),
+    )
+    return parsed_range, trimmed_values