omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,166 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Loads Google Sheets data from tabs, named and explicit ranges. Contains the main source functions."""
16
+
17
+ from typing import Iterable, Sequence, Union
18
+
19
+ import dlt
20
+ from dlt.common import logger
21
+ from dlt.sources import DltResource
22
+ from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
23
+
24
+ from .helpers import api_calls
25
+ from .helpers.api_calls import api_auth
26
+ from .helpers.data_processing import (
27
+ get_data_types,
28
+ get_range_headers,
29
+ get_spreadsheet_id,
30
+ process_range,
31
+ )
32
+
33
+
34
@dlt.source
def google_spreadsheet(
    spreadsheet_url_or_id: str = dlt.config.value,
    range_names: Sequence[str] = dlt.config.value,
    credentials: Union[
        GcpOAuthCredentials, GcpServiceAccountCredentials
    ] = dlt.secrets.value,
    get_sheets: bool = False,
    get_named_ranges: bool = True,
    max_api_retries: int = 5,
) -> Iterable[DltResource]:
    """
    dlt source that turns ranges of a Google spreadsheet into resources.

    One resource (write disposition "replace") is yielded per loadable range, followed by
    a single "spreadsheet_info" resource (write disposition "merge").

    Args:
        spreadsheet_url_or_id (str): The ID or URL of the spreadsheet.
        range_names (Sequence[str]): Ranges to load, in the format used by Google Sheets.
            Named ranges and sheet (tab) names are accepted. Duplicates are collapsed.
        credentials (Union[GcpServiceAccountCredentials, GcpOAuthCredentials]): GCP credentials
            with Google Sheets API access, defined in dlt.secrets.
        get_sheets (bool, optional): Load every sheet of the spreadsheet. Only consulted
            when `range_names` is empty. Defaults to False.
        get_named_ranges (bool, optional): Load every named range of the spreadsheet. Only
            consulted when `range_names` is empty. Defaults to True.
        max_api_retries (int, optional): Max number of retries passed to the Google client.
            Actual behavior is internal to the Google client.

    Yields:
        Iterable[DltResource]: One resource per loadable range, then "spreadsheet_info".
    """
    service = api_auth(credentials, max_api_retries=max_api_retries)
    spreadsheet_id = get_spreadsheet_id(spreadsheet_url_or_id)
    requested_ranges = set(range_names) if range_names else set()

    # the spreadsheet metadata is fetched unconditionally; the title is currently unused
    sheet_names, named_ranges, _spreadsheet_title = api_calls.get_known_range_names(
        spreadsheet_id=spreadsheet_id, service=service
    )
    # sheets / named ranges are added only when no explicit ranges were requested
    if not range_names:
        if get_sheets:
            requested_ranges.update(sheet_names)
        if get_named_ranges:
            requested_ranges.update(named_ranges)

    # fetch the cell values of every requested range in a single batch call
    fetched_ranges = api_calls.get_data_for_ranges(
        service=service,
        spreadsheet_id=spreadsheet_id,
        range_names=list(requested_ranges),
    )
    assert len(requested_ranges) == len(fetched_ranges), (
        "Google Sheets API must return values for all requested ranges"
    )

    # Values alone are not enough to infer types: dates and datetimes come back as
    # serial numbers (plain floats), so cell metadata of the first two rows is needed.
    # The first row should hold headers, the second one is sampled for data types.

    # warn about and drop the ranges that cannot be loaded
    loadable_ranges = []
    # NOTE: nothing is appended to this list (the code that filled it is disabled),
    # so the "spreadsheet_info" resource below is yielded without rows.
    metadata_table = []
    for name, parsed_range, meta_range, values in fetched_ranges:
        if not values:
            logger.warning(f"Range {name} does not contain any data. Skipping.")
            continue
        if len(values) == 1:
            logger.warning(f"Range {name} contain only 1 row of data. Skipping.")
            continue
        if len(values[0]) == 0:
            logger.warning(
                f"First row of range {name} does not contain data. Skipping."
            )
            continue
        loadable_ranges.append((name, parsed_range, meta_range, values))

    meta_values = api_calls.get_meta_for_ranges(
        service, spreadsheet_id, [str(item[2]) for item in loadable_ranges]
    )
    for name, parsed_range, _, values in loadable_ranges:
        logger.info(f"Processing range {parsed_range} with name {name}")
        # The returned metadata cannot be paired with the input ranges directly: it is
        # grouped by sheet, and within a sheet the order of the input ranges is kept.
        # So we locate the group of the range's sheet and consume its first entry.
        sheet_meta = next(
            sheet
            for sheet in meta_values["sheets"]
            if sheet["properties"]["title"] == parsed_range.sheet_name
        )
        metadata = sheet_meta["data"].pop(0)

        first_row_metadata = metadata["rowData"][0]["values"]
        headers = get_range_headers(first_row_metadata, name)
        if headers is not None:
            # first row contains headers and is skipped
            data_row_metadata = metadata["rowData"][1]["values"]
            rows_data = values[1:]
        else:
            # generate automatic headers and treat the first row as data
            headers = [
                f"col_{position}"
                for position in range(1, len(first_row_metadata) + 1)
            ]
            data_row_metadata = first_row_metadata
            rows_data = values[0:]
            logger.warning(
                f"Using automatic headers. WARNING: first row of the range {name} will be used as data!"
            )

        data_types = get_data_types(data_row_metadata)

        yield dlt.resource(
            process_range(rows_data, headers=headers, data_types=data_types),
            name=name,
            write_disposition="replace",
        )
    yield dlt.resource(
        metadata_table,
        write_disposition="merge",
        name="spreadsheet_info",
        merge_key="spreadsheet_id",
    )
@@ -0,0 +1,15 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Google Sheets source helpers"""
@@ -0,0 +1,160 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Contains helper functions to extract data from spreadsheet API"""
16
+
17
+ from typing import Any, List, Tuple
18
+
19
+ from dlt.common.exceptions import MissingDependencyException
20
+ from dlt.common.typing import DictStrAny
21
+ from dlt.sources.credentials import GcpCredentials, GcpOAuthCredentials
22
+ from dlt.sources.helpers.requests.retry import DEFAULT_RETRY_STATUS
23
+ from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
24
+
25
+ from .data_processing import ParsedRange, trim_range_top_left
26
+
27
+ try:
28
+ from apiclient.discovery import Resource, build
29
+ except ImportError:
30
+ raise MissingDependencyException("Google API Client", ["google-api-python-client"])
31
+
32
+
33
def is_retry_status_code(exception: BaseException) -> bool:
    """Retry predicate: True for an HttpError whose status is in DEFAULT_RETRY_STATUS.

    Args:
        exception (BaseException): The exception raised by the wrapped call.

    Returns:
        bool: True when the call should be retried, False for any other exception.
    """
    # imported inside the function, as in the rest of this module's lazy Google imports
    from googleapiclient.errors import HttpError  # type: ignore

    if not isinstance(exception, HttpError):
        return False
    return exception.resp.status in DEFAULT_RETRY_STATUS
45
+
46
+
47
# Shared retry policy applied as a decorator to the API helpers below.
retry_deco = retry(
    # give up after the 10th attempt
    stop=stop_after_attempt(10),
    # exponential backoff between attempts, bounded to the 5..120 seconds range
    wait=wait_exponential(multiplier=1.5, min=5, max=120),
    # retry only the exceptions accepted by `is_retry_status_code`
    # (HttpError with a status listed in DEFAULT_RETRY_STATUS)
    retry=retry_if_exception(is_retry_status_code),
    # once attempts are exhausted, re-raise the original exception
    reraise=True,
)
57
+
58
+
59
def api_auth(credentials: GcpCredentials, max_api_retries: int) -> Resource:
    """
    Builds a Google Sheets API (v4) client from GCP credentials.

    OAuth credentials are first authorized for the read-only spreadsheets scope;
    other credential types are passed to the client as they are.

    Args:
        credentials (GcpCredentials): Credentials needed to log in to GCP.
        max_api_retries (int): Max number of retries passed to the Google client.
            Actual behavior is internal to the Google client.

    Returns:
        Resource: Object needed to make API calls to Google Sheets API.
    """
    if isinstance(credentials, GcpOAuthCredentials):
        credentials.auth("https://www.googleapis.com/auth/spreadsheets.readonly")
    native_credentials = credentials.to_native_credentials()
    return build(
        "sheets",
        "v4",
        credentials=native_credentials,
        num_retries=max_api_retries,
    )
80
+
81
+
82
@retry_deco
def get_meta_for_ranges(
    service: Resource, spreadsheet_id: str, range_names: List[str]
) -> Any:
    """
    Retrieves cell metadata (grid data) of `range_names` in spreadsheet `spreadsheet_id`.

    Args:
        service (Resource): Object to make API calls to Google Sheets.
        spreadsheet_id (str): The ID of the spreadsheet.
        range_names (List[str]): Ranges for which grid data is requested.

    Returns:
        Any: The response of the `spreadsheets().get` call, returned unchanged.
    """
    request = service.spreadsheets().get(
        spreadsheetId=spreadsheet_id,
        ranges=range_names,
        includeGridData=True,
    )
    return request.execute()
96
+
97
+
98
@retry_deco
def get_known_range_names(
    spreadsheet_id: str, service: Resource
) -> Tuple[List[str], List[str], str]:
    """
    Fetches the spreadsheet metadata and lists its sheets and named ranges.

    Args:
        spreadsheet_id (str): The ID of the spreadsheet.
        service (Resource): Resource object used to make API calls to Google Sheets API.

    Returns:
        Tuple[List[str], List[str], str]: sheet names, named range names, spreadsheet title.
        The list of named ranges is empty when the response has no "namedRanges" entry.
    """
    spreadsheet = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()

    sheet_names: List[str] = []
    for sheet in spreadsheet["sheets"]:
        sheet_names.append(sheet["properties"]["title"])

    named_ranges: List[str] = []
    for named_range in spreadsheet.get("namedRanges", []):
        named_ranges.append(named_range["name"])

    title: str = spreadsheet["properties"]["title"]
    return sheet_names, named_ranges, title
117
+
118
+
119
@retry_deco
def get_data_for_ranges(
    service: Resource, spreadsheet_id: str, range_names: List[str]
) -> List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]:
    """
    Gets the values of several ranges with one `batchGet` call.

    Args:
        service (Resource): Object to make API calls to Google Sheets.
        spreadsheet_id (str): The ID of the spreadsheet.
        range_names (List[str]): List of range names.

    Returns:
        A list of `(name, parsed_range, meta_range, values)` tuples, paired with
        `range_names` by position:
        - `parsed_range` is the range reported by the API, trimmed by
          `trim_range_top_left` when the range has values,
        - `meta_range` is `parsed_range` cut down to its first two rows,
        - `values` are the (trimmed) rows, or None when the response has no values.
    """
    response = (
        service.spreadsheets()
        .values()
        .batchGet(
            spreadsheetId=spreadsheet_id,
            ranges=range_names,
            # unformatted values keep their types
            valueRenderOption="UNFORMATTED_VALUE",
            # dates and datetimes are returned as serial numbers
            dateTimeRenderOption="SERIAL_NUMBER",
        )
        .execute()
    )
    # "valueRanges" is absent when there were no ranges to load
    value_ranges: List[DictStrAny] = response.get("valueRanges", [])

    results = []
    for name, value_range in zip(range_names, value_ranges):
        parsed = ParsedRange.parse_range(value_range["range"])
        values: List[List[Any]] = value_range.get("values", None)
        if values:
            # drop the empty rows at the top and empty columns on the left
            parsed, values = trim_range_top_left(parsed, values)
        # the metadata request only needs the first two rows of the range
        two_row_range = parsed._replace(end_row=parsed.start_row + 1)
        results.append((name, parsed, two_row_range, values))
    return results
@@ -0,0 +1,316 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Helper module with functions that validate and process spreadsheet data"""
16
+
17
+ import re
18
+ from typing import Any, Iterator, List, NamedTuple, Tuple, Union
19
+
20
+ import dlt
21
+ from dlt.common import logger, pendulum
22
+ from dlt.common.data_types import TDataType
23
+ from dlt.common.typing import DictStrAny
24
+
25
# path segment that directly precedes the spreadsheet id in a spreadsheet URL
URL_ID_IDENTIFIER = "d"
# number of seconds in one day, used to scale serial numbers (which count days)
SECONDS_PER_DAY = 24 * 60 * 60
# timezone in which converted datetimes are expressed
DLT_TIMEZONE = "UTC"
# offset in seconds between the UNIX timestamp origin (1st Jan 1970)
# and the serial number origin (30th Dec 1899)
TIMESTAMP_CONST = -2209161600.0
# Compiled regex splitting a range such as `Sheet1!A1:C10` into an optional sheet
# name, start column/row and end column/row.
# NOTE(review): the sheet part only admits word characters, whitespace and single
# quotes, so names with other characters (e.g. "-") do not match - confirm intended.
RE_PARSE_RANGE = re.compile(
    r"^(?:(?P<sheet>[\'\w\s]+)!)?"
    r"(?P<start_col>[A-Z]+)(?P<start_row>\d+)"
    r":(?P<end_col>[A-Z]+)(?P<end_row>\d+)$"
)
37
+
38
+
39
class ParsedRange(NamedTuple):
    """A cell range split into sheet name, start cell and end cell."""

    # sheet (tab) name without surrounding single quotes; empty string when the
    # parsed range had no sheet prefix
    sheet_name: str
    # column letters and row number of the top-left cell
    start_col: str
    start_row: int
    # column letters and row number of the bottom-right cell
    end_col: str
    end_row: int

    @classmethod
    def parse_range(cls, s: str) -> "ParsedRange":
        """
        Parses a range string such as `Sheet1!A1:C10` using `RE_PARSE_RANGE`.

        Args:
            s (str): The range string.

        Returns:
            ParsedRange: The parsed components. Surrounding single quotes are stripped
            from the sheet name; a range without a sheet prefix gets an empty sheet name.

        Raises:
            ValueError: If `s` does not match `RE_PARSE_RANGE`.
        """
        match = RE_PARSE_RANGE.match(s)
        if match is None:
            raise ValueError(s)
        parts = match.groupdict()
        # the sheet group is optional in the pattern and is None when absent
        sheet = parts["sheet"] or ""
        return cls(
            sheet.strip("'"),
            parts["start_col"],
            int(parts["start_row"]),
            parts["end_col"],
            int(parts["end_row"]),
        )

    def __str__(self) -> str:
        """Formats the range as `<sheet>!<start>:<end>`, or `<start>:<end>` without a sheet name."""
        cells = f"{self.start_col}{self.start_row}:{self.end_col}{self.end_row}"
        if not self.sheet_name:
            return cells
        return f"{self.sheet_name}!{cells}"

    @staticmethod
    def shift_column(col: str, shift: int) -> str:
        """
        Shift a Google Sheets column string by a given number of positions.

        Parameters:
            col (str): The original column string (letters, case-insensitive).
            shift (int): The number of positions to shift the column.

        Returns:
            str: The new column string after shifting, in upper case. An empty string
            is returned when the shifted position is not a positive column number.
        """
        # column letters are a bijective base-26 number: A=1 ... Z=26, AA=27
        col_num = 0
        for power, char in enumerate(reversed(col)):
            col_num += (ord(char.upper()) - ord("A") + 1) * (26**power)

        col_num += shift

        # convert the 1-indexed column number back to letters
        letters = []
        while col_num > 0:
            col_num, remainder = divmod(col_num - 1, 26)
            letters.append(chr(ord("A") + remainder))
        return "".join(reversed(letters))
91
+
92
+
93
def get_spreadsheet_id(url_or_id: str) -> str:
    """
    Returns the spreadsheet ID for a value that is either the ID itself or a spreadsheet URL.

    Args:
        url_or_id (str): The ID or URL of the spreadsheet.

    Returns:
        str: The spreadsheet ID. A value that does not start with `http://` or
        `https://` is treated as an ID and returned unchanged.
    """
    looks_like_url = re.match(r"http://|https://", url_or_id) is not None
    if not looks_like_url:
        return url_or_id
    return extract_spreadsheet_id_from_url(url_or_id)
112
+
113
+
114
def extract_spreadsheet_id_from_url(url: str) -> str:
    """
    Extracts the spreadsheet ID from a Google spreadsheet URL of the form
    https://docs.google.com/spreadsheets/d/<spreadsheet_id>/edit.

    The ID is the path segment that follows the first `URL_ID_IDENTIFIER` segment.
    A query string or fragment attached directly to that segment
    (e.g. `.../d/<spreadsheet_id>?usp=sharing`) is not part of the ID and is dropped.

    Args:
        url (str): The URL to the spreadsheet.

    Returns:
        str: The spreadsheet ID as a string.

    Raises:
        ValueError: If the URL has no ID segment or the ID is empty.
    """
    parts = url.split("/")
    # walk over (segment, following segment) pairs
    for segment, following in zip(parts, parts[1:]):
        if segment != URL_ID_IDENTIFIER:
            continue
        # cut off "?query" / "#fragment" when the id is the last path segment
        spreadsheet_id = re.split(r"[?#]", following, maxsplit=1)[0]
        if spreadsheet_id == "":
            # an empty id segment means the url is not formatted correctly
            raise ValueError(f"Spreadsheet ID is an empty string in url: {url}")
        return spreadsheet_id
    raise ValueError(f"Invalid URL. Cannot find spreadsheet ID in url: {url}")
140
+
141
+
142
def get_range_headers(
    headers_metadata: List[DictStrAny], range_name: str
) -> Union[List[str], None]:
    """
    Retrieves the headers for columns from the metadata of a range.

    Args:
        headers_metadata (List[DictStrAny]): Cell metadata of the first row of a range.
        range_name (str): The name of the range, used in log messages.

    Returns:
        Union[List[str], None]: The list of headers, or None (after logging a warning)
        when the row cannot serve as a header row: a cell has no metadata, a cell holds
        a non-string value, or the headers are not unique after normalization.
    """
    headers = []
    for idx, header in enumerate(headers_metadata):
        if not header:
            logger.warning(
                f"In range {range_name}, header at position {idx + 1} is missing!"
            )
            return None
        if "stringValue" in header.get("effectiveValue", {}):
            header_val: str = header["formattedValue"]
        else:
            header_val = header.get("formattedValue", None)
            if header_val is not None:
                logger.warning(
                    f"In range {range_name}, header value: {header_val} at position {idx + 1} is not a string!"
                )
                return None
            # no formatted value at all means the cell is empty (not even an empty
            # string): use an automatic name and move on
            header_val = f"col_{idx + 1}"
        headers.append(header_val)

    # headers must stay unique after the schema's naming convention normalizes them
    header_mappings = {
        h: dlt.current.source_schema().naming.normalize_identifier(h) for h in headers
    }
    if len(set(header_mappings.values())) != len(headers):
        logger.warning(
            "Header names must be unique otherwise you risk that data in columns with duplicate header names to be lost. Note that several destinations require "
            + "that column names are normalized ie. must be lower or upper case and without special characters. dlt normalizes those names for you but it may "
            + f"result in duplicate column names. Headers in range {range_name} are mapped as follows: "
            + ", ".join([f"{k}->{v}" for k, v in header_mappings.items()])
            + ". Please make your header names unique."
        )
        return None

    return headers
191
+
192
+
193
def get_data_types(data_row_metadata: List[DictStrAny]) -> List[TDataType]:
    """
    Determines, for each cell of a data row, whether it holds a date or a date/time.

    The decision is based on the number format type found under
    `effectiveFormat.numberFormat.type` in the cell metadata.

    Args:
        data_row_metadata (List[DictStrAny]): Metadata of the first row of data

    Returns:
        List[TDataType]: One entry per cell: "timestamp" for DATE_TIME and TIME formats,
        "date" for the DATE format, otherwise None (also when the cell has no number
        format). An empty input gives an empty list.
    """
    data_types: List[TDataType] = []
    for cell_metadata in data_row_metadata:
        try:
            format_type = cell_metadata["effectiveFormat"]["numberFormat"]["type"]
        except KeyError:
            # the cell carries no number format information
            data_types.append(None)
            continue
        if format_type in ("DATE_TIME", "TIME"):
            data_types.append("timestamp")
        elif format_type == "DATE":
            data_types.append("date")
        else:
            data_types.append(None)
    return data_types
219
+
220
+
221
def serial_date_to_datetime(
    serial_number: Union[int, float], data_type: TDataType
) -> Union[pendulum.DateTime, pendulum.Date]:
    """
    Converts a spreadsheet serial number into a pendulum datetime or date.

    The serial number is multiplied by SECONDS_PER_DAY, rounded to whole seconds, offset by
    TIMESTAMP_CONST and added to timestamp 0 in DLT_TIMEZONE.

    Args:
        serial_number (Union[int, float]): The serial number to convert
        data_type (TDataType): "date" to get only the date part, any other value to get the full datetime

    Returns:
        Union[pendulum.DateTime, pendulum.Date]: A date when `data_type` is "date", otherwise a datetime
    """
    unix_epoch = pendulum.from_timestamp(0, DLT_TIMEZONE)
    # NOTE(review): TIMESTAMP_CONST is presumably the offset in seconds between the start date of
    # serial numbers and the Unix epoch - confirm against its definition
    seconds_from_epoch = TIMESTAMP_CONST + round(SECONDS_PER_DAY * serial_number)
    converted: pendulum.DateTime = unix_epoch + pendulum.duration(
        seconds=seconds_from_epoch
    )
    # the requested data type, not the Python type of the serial number, selects the result type
    if data_type != "date":
        return converted

    return converted.date()  # type: ignore[no-any-return]
244
+
245
+
246
def process_range(
    sheet_values: List[List[Any]], headers: List[str], data_types: List[TDataType]
) -> Iterator[DictStrAny]:
    """
    Yields rows of values as dictionaries, converts date/time values and handles empty rows and cells. Please note:
    1. empty rows get ignored
    2. empty cells are converted to None (and then to NULL by dlt); cells missing at the end of a
       row shorter than the headers are treated as empty cells
    3. data in columns without headers will be dropped
    4. columns without an entry in `data_types` are passed through without date/time conversion

    Args:
        sheet_values (List[List[Any]]): range values without the header row
        headers (List[str]): names of the headers
        data_types (List[TDataType]): "timestamp" and "date" or None for each column

    Yields:
        DictStrAny: A dictionary of the type {header: value} for every non-empty row. Every
            dictionary contains all headers as keys.
    """
    date_time_types = ("timestamp", "date")
    known_types = len(data_types)

    for row in sheet_values:
        # empty row; skip
        if not row:
            continue
        cells_in_row = len(row)
        table_dict = {}
        # iterate over the headers and not over zip(row, headers, data_types): zip stops at the
        # shortest input, which drops columns when a row or data_types is shorter than the headers
        for idx, header in enumerate(headers):
            val = row[idx] if idx < cells_in_row else ""
            data_type = data_types[idx] if idx < known_types else None
            # 3 main cases: null cell value, datetime value, every other value
            if val == "":
                # null cell values are returned as empty strings; passing them on would make dlt
                # create new columns and fill them with empty strings
                fill_val = None
            elif (
                data_type in date_time_types
                and isinstance(val, (int, float))
                and not isinstance(val, bool)
            ):
                fill_val = serial_date_to_datetime(val, data_type)
            else:
                # the date/time types are inferred from the first row of data. if next rows have
                # inconsistent data types - pass the values to dlt to deal with them
                fill_val = val
            table_dict[header] = fill_val
        yield table_dict
285
+
286
+
287
def trim_range_top_left(
    parsed_range: ParsedRange, range_values: List[List[Any]]
) -> Tuple[ParsedRange, List[List[Any]]]:
    """
    Removes leading empty rows and leading empty columns from range values and moves the start of the range accordingly.

    Leading rows are skipped as long as they are empty lists. The number of leading empty columns is
    the number of empty string cells at the start of the first remaining row; that many cells are
    cut from the start of every row.

    Args:
        parsed_range (ParsedRange): the range the values belong to
        range_values (List[List[Any]]): values of the range, row by row. The passed list is not modified.

    Returns:
        Tuple[ParsedRange, List[List[Any]]]: the range with `start_row` and `start_col` shifted by
            the number of skipped rows and columns, and the trimmed values
    """
    # skip empty rows
    empty_rows = 0
    for row in range_values:
        if row:
            break
        empty_rows += 1
    if empty_rows > 0:
        range_values = range_values[empty_rows:]

    # skip empty columns, counted on the first non-empty row
    empty_cols = 0
    if len(range_values) > 0:
        for cell in range_values[0]:
            if cell == "":
                empty_cols += 1
            else:
                break
        if empty_cols > 0:
            # build new rows instead of assigning into the list, which may be the caller's own object
            range_values = [row[empty_cols:] for row in range_values]

    parsed_range = parsed_range._replace(
        start_row=parsed_range.start_row + empty_rows,
        start_col=ParsedRange.shift_column(parsed_range.start_col, empty_cols),
    )
    return parsed_range, range_values