omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,144 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Defines all the sources and resources needed for Google Analytics V4
17
+ """
18
+
19
+ from typing import Iterator, List, Optional, Union
20
+
21
+ import dlt
22
+ from dlt.common import pendulum
23
+ from dlt.common.typing import DictStrAny, TDataItem
24
+ from dlt.sources import DltResource
25
+ from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
26
+ from google.analytics.data_v1beta import BetaAnalyticsDataClient
27
+ from google.analytics.data_v1beta.types import (
28
+ Dimension,
29
+ Metric,
30
+ MinuteRange,
31
+ )
32
+
33
+ from .helpers import get_realtime_report, get_report
34
+
35
+
36
@dlt.source(max_table_nesting=0)
def google_analytics(
    datetime_dimension: str,
    credentials: Union[
        GcpOAuthCredentials, GcpServiceAccountCredentials
    ] = dlt.secrets.value,
    property_ids: List[str] = dlt.config.value,
    queries: List[DictStrAny] = dlt.config.value,
    start_date: Optional[pendulum.DateTime] = pendulum.datetime(2024, 1, 1),
    end_date: Optional[pendulum.DateTime] = None,
    rows_per_page: int = 10000,
    minute_range_objects: List[MinuteRange] | None = None,
) -> List[DltResource]:
    """
    Builds the Google Analytics V4 resources for a single query.

    Args:
        datetime_dimension: Name of the dimension used as the incremental cursor
            and as the merge key of the "custom" resource.
        credentials: OAuth or service account credentials. For OAuth credentials
            an access token with the analytics.readonly scope is requested.
        property_ids: Numeric Google Analytics property ids (as strings).
        queries: A list holding exactly one query dict with "dimensions" and
            "metrics" keys.
        start_date: Initial value of the incremental cursor.
        end_date: End value of the incremental cursor. When it is None the
            report runs up to yesterday.
        rows_per_page: Number of rows requested per API call.
        minute_range_objects: Optional minute ranges passed to the realtime report.

    Returns:
        A list with two resources: the incremental report ("custom") and the
        realtime report ("realtime").

    Raises:
        ValueError: If a property id is not numeric or is 0, if `rows_per_page`
            is 0, or if `queries` does not contain exactly one query.
    """
    validated_property_ids = []
    for pid in property_ids:
        try:
            int_pid = int(pid)
        except ValueError:
            # The message is self-contained, so the int() traceback adds nothing.
            raise ValueError(
                f"{pid} is an invalid google property id. Please use a numeric id, and not your Measurement ID like G-7F1AE12JLR"
            ) from None
        if int_pid == 0:
            raise ValueError(
                "Google Analytics property id is 0. Did you forget to configure it?"
            )
        validated_property_ids.append(int_pid)

    if not rows_per_page:
        raise ValueError("Rows per page cannot be 0")
    # generate access token for credentials if we are using OAuth2.0
    if isinstance(credentials, GcpOAuthCredentials):
        credentials.auth("https://www.googleapis.com/auth/analytics.readonly")

    # Build the service object for Google Analytics api.
    client = BetaAnalyticsDataClient(credentials=credentials.to_native_credentials())
    if not queries:
        # Without this guard an empty list would fail below with a bare IndexError.
        raise ValueError(
            "Google Analytics requires a query to ingest, please give exactly one query"
        )
    if len(queries) > 1:
        raise ValueError(
            "Google Analytics supports a single query ingestion at a time, please give only one query"
        )
    query = queries[0]

    # NOTE(review): dimensions are used exactly as configured. Nothing is added
    # here, so the query itself has to contain `datetime_dimension` for the
    # incremental cursor to find a value - confirm against the caller.
    dimensions = query["dimensions"]

    @dlt.resource(
        name="custom",
        merge_key=datetime_dimension,
        write_disposition="merge",
    )
    def basic_report(
        incremental=dlt.sources.incremental(
            datetime_dimension,
            initial_value=start_date,
            end_value=end_date,
            range_end="closed",
            range_start="closed",
        ),
    ) -> Iterator[TDataItem]:
        # Distinct local names: the defaults above already captured the
        # enclosing start_date / end_date at definition time.
        report_start = incremental.last_value
        report_end = incremental.end_value
        if report_start is None:
            report_start = pendulum.datetime(2024, 1, 1)
        if report_end is None:
            report_end = pendulum.yesterday()
        for property_id in validated_property_ids:
            yield from get_report(
                client=client,
                property_id=property_id,
                dimension_list=[Dimension(name=dimension) for dimension in dimensions],
                metric_list=[Metric(name=metric) for metric in query["metrics"]],
                per_page=rows_per_page,
                start_date=report_start,
                end_date=report_end,
            )

    # real time report
    @dlt.resource(
        name="realtime",
        merge_key="ingested_at",
        write_disposition="merge",
    )
    def real_time_report() -> Iterator[TDataItem]:
        for property_id in validated_property_ids:
            yield from get_realtime_report(
                client=client,
                property_id=property_id,
                dimension_list=[Dimension(name=dimension) for dimension in dimensions],
                metric_list=[Metric(name=metric) for metric in query["metrics"]],
                per_page=rows_per_page,
                minute_range_objects=minute_range_objects,
            )

    return [basic_report, real_time_report]
@@ -0,0 +1,312 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ This module contains helpers that process data and make it ready for loading into the database
17
+ """
18
+
19
+ import base64
20
+ import json
21
+ from typing import Any, Iterator, List, Union
22
+ from urllib.parse import parse_qs, urlparse
23
+
24
+ import proto
25
+ from dlt.common.exceptions import MissingDependencyException
26
+ from dlt.common.pendulum import pendulum
27
+ from dlt.common.typing import DictStrAny, TDataItem, TDataItems
28
+
29
+ try:
30
+ from google.analytics.data_v1beta import BetaAnalyticsDataClient # noqa: F401
31
+ from google.analytics.data_v1beta.types import (
32
+ DateRange,
33
+ Dimension,
34
+ DimensionExpression, # noqa: F401
35
+ DimensionMetadata, # noqa: F401
36
+ GetMetadataRequest, # noqa: F401
37
+ Metadata, # noqa: F401
38
+ Metric,
39
+ MetricMetadata, # noqa: F401
40
+ MetricType,
41
+ MinuteRange,
42
+ RunRealtimeReportRequest,
43
+ RunReportRequest,
44
+ RunReportResponse,
45
+ )
46
+ except ImportError:
47
+ raise MissingDependencyException(
48
+ "Google Analytics API Client", ["google-analytics-data"]
49
+ )
50
+ try:
51
+ from apiclient.discovery import Resource, build # type: ignore # noqa: F401
52
+ except ImportError:
53
+ raise MissingDependencyException("Google API Client", ["google-api-python-client"])
54
+
55
+
56
def to_dict(item: Any) -> Iterator[TDataItem]:
    """
    Converts a proto message into a plain dictionary and yields it.

    The message is serialized to JSON (keeping proto field names, using enum
    names instead of integers and leaving out default-valued fields) and the
    JSON text is then parsed back into Python objects.

    :param item: the proto message to convert
    :return: an iterator yielding the single converted dictionary
    """
    serialized = proto.Message.to_json(
        item,
        preserving_proto_field_name=True,
        use_integers_for_enums=False,
        including_default_value_fields=False,
    )
    yield json.loads(serialized)
71
+
72
+
73
def get_realtime_report(
    client: Resource,
    property_id: int,
    dimension_list: List[Dimension],
    metric_list: List[Metric],
    per_page: int,
    minute_range_objects: List[MinuteRange] | None = None,
) -> Iterator[TDataItem]:
    """
    Runs a realtime report and yields a dictionary for every row of the response.

    The realtime request built here carries no offset, so it cannot be paged.
    Exactly one request is issued: repeating it would only return the same rows
    again and yield duplicates.

    Args:
        client: The Google Analytics client used to make requests.
        property_id: A reference to the Google Analytics project.
            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
        dimension_list: A list of all the dimensions requested in the query.
        metric_list: A list of all the metrics requested in the query.
        per_page: Maximum number of rows requested from the API.
        minute_range_objects: Optional minute ranges to report on. An empty list
            is treated the same as None.

    Yields:
        Every row of the report, with "ingested_at" set to today's date string
        and "property_id" set to the property id as a string.
    """
    ingest_at = pendulum.now().to_date_string()

    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=dimension_list,
        metrics=metric_list,
        limit=per_page,
        minute_ranges=minute_range_objects if minute_range_objects else None,
    )
    response = client.run_realtime_report(request)

    for row in process_report(response=response, ingest_at=ingest_at):
        row["property_id"] = str(property_id)
        yield row
120
+
121
+
122
def get_report(
    client: Resource,
    property_id: int,
    dimension_list: List[Dimension],
    metric_list: List[Metric],
    per_page: int,
    start_date: pendulum.DateTime,
    end_date: pendulum.DateTime,
) -> Iterator[TDataItem]:
    """
    Pages through a report for the given date range and yields every row.

    Args:
        client: The Google Analytics client used to make requests.
        property_id: A reference to the Google Analytics project.
            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
        dimension_list: A list of all the dimensions requested in the query.
        metric_list: A list of all the metrics requested in the query.
        per_page: Number of rows requested per page.
        start_date: The starting date of the query.
        end_date: The ending date of the query.

    Yields:
        Every row of the report as a dictionary, with "property_id" set to the
        property id as a string.
    """
    property_path = f"properties/{property_id}"
    property_label = str(property_id)

    rows_requested = 0
    fetching = True
    while fetching:
        report_request = RunReportRequest(
            property=property_path,
            dimensions=dimension_list,
            metrics=metric_list,
            limit=per_page,
            offset=rows_requested,
            date_ranges=[
                DateRange(
                    start_date=start_date.to_date_string(),
                    end_date=end_date.to_date_string(),
                )
            ],
        )
        page = client.run_report(report_request)

        for record in process_report(response=page):
            record["property_id"] = property_label
            yield record

        rows_requested += per_page
        # Continue only after a full page, and stop once the offset has moved
        # past 1,000,000 rows.
        fetching = len(page.rows) >= per_page and rows_requested <= 1000000
176
+
177
+
178
def process_report(
    response: RunReportResponse, ingest_at: str | None = None
) -> Iterator[TDataItems]:
    """
    Turns every row of a report response into a flat dictionary.

    Args:
        response: The report response whose rows should be processed.
        ingest_at: Optional value stored under "ingested_at" in every row. The
            key is left out when this is None.

    Yields:
        One dictionary per row, keyed by the dimension and metric header names.
        Dimension values go through `_resolve_dimension_value`; metric values
        are converted with `process_metric_value` according to the header type.
    """
    dimensions_headers = [header.name for header in response.dimension_headers]

    for row in response.rows:
        response_dict: DictStrAny = {
            dimension_header: _resolve_dimension_value(
                dimension_header, dimension_value.value
            )
            for dimension_header, dimension_value in zip(
                dimensions_headers, row.dimension_values
            )
        }

        # Metric values are matched to their headers by position; the header
        # carries the type that decides how the raw string is converted.
        for position, metric_header in enumerate(response.metric_headers):
            response_dict[metric_header.name] = process_metric_value(
                metric_type=metric_header.type_,
                value=row.metric_values[position].value,
            )

        if ingest_at is not None:
            response_dict["ingested_at"] = ingest_at

        yield response_dict
211
+
212
+
213
def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int, float]:
    """
    Converts the string value of a metric to the Python type matching its metric type.

    Args:
        metric_type: The type of the metric; only its numeric `.value` is used.
        value: The value of the metric as a string.

    Returns:
        The value unchanged for type 0, an int for type 1 and a float for every
        other type.
    """
    # Type numbering follows the GA4 documentation:
    # https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/MetricType
    # 0 is kept as a string, 1 is an integer and the remaining types are treated as floats.
    type_code = metric_type.value
    if type_code == 0:
        return value
    return int(value) if type_code == 1 else float(value)
233
+
234
+
235
+ def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
236
+ if dimension_name == "date":
237
+ return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
238
+ elif dimension_name == "dateHour":
239
+ return pendulum.from_format(dimension_value, "YYYYMMDDHH", tz="UTC")
240
+ elif dimension_name == "dateHourMinute":
241
+ return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
242
+ else:
243
+ return dimension_value
244
+
245
+
246
def convert_minutes_ranges_to_minute_range_objects(
    minutes_ranges: str,
) -> List[MinuteRange]:
    """
    Parses a comma separated list of minute ranges into MinuteRange objects.

    Every range is written as two non-negative whole numbers joined by a
    single "-", for example "1-2,5-6". Spaces are ignored. The first number is
    used as `end_minutes_ago` and the second one as `start_minutes_ago`.

    Args:
        minutes_ranges: The ranges as a string, e.g. "1-2,5-6".

    Returns:
        One MinuteRange per range, in input order.

    Raises:
        ValueError: If the input is empty, a range has no "-", or a range is
            not made of exactly two digit-only parts (so "1-2-3" is rejected).
    """
    format_error = "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"

    cleaned = minutes_ranges.strip().replace(" ", "")
    if not cleaned:
        raise ValueError(format_error)

    minute_range_objects = []
    for min_range in cleaned.split(","):
        if "-" not in min_range:
            raise ValueError(format_error)

        # Split on the first "-" only: extra parts ("1-2-3") stay in the second
        # half and fail the digit check instead of being silently dropped.
        first, second = min_range.split("-", 1)
        if not first.isdigit() or not second.isdigit():
            raise ValueError(
                f"Invalid input '{min_range}'. Both start and end minutes must be digits. For example: 1-2,5-6"
            )

        end_minutes_ago = int(first)
        start_minutes_ago = int(second)
        minute_range_objects.append(
            MinuteRange(
                name=f"{end_minutes_ago}-{start_minutes_ago} minutes ago",
                start_minutes_ago=start_minutes_ago,
                end_minutes_ago=end_minutes_ago,
            )
        )

    return minute_range_objects
280
+
281
+
282
def parse_google_analytics_uri(uri: str) -> dict:
    """
    Extracts the credentials and property ids from a Google Analytics source URI.

    The query string must contain `property_id` (one id or a comma separated
    list) and either `credentials_path` (path to a JSON credentials file) or
    `credentials_base64` (base64 encoded JSON credentials). When both
    credential parameters are given, `credentials_path` wins.

    Args:
        uri: The source URI whose query string holds the parameters.

    Returns:
        A dict with "credentials" (the parsed JSON) and "property_ids" (a list
        of stripped, non-empty id strings).

    Raises:
        ValueError: If no credentials parameter or no property id is given.
    """
    source_fields = parse_qs(urlparse(uri).query)
    cred_path = source_fields.get("credentials_path")
    cred_base64 = source_fields.get("credentials_base64")

    if not cred_path and not cred_base64:
        raise ValueError(
            "credentials_path or credentials_base64 is required to connect Google Analytics"
        )

    if cred_path:
        # JSON files are UTF-8; do not depend on the platform default encoding.
        with open(cred_path[0], "r", encoding="utf-8") as f:
            credentials = json.load(f)
    else:
        # parse_qs decodes a literal "+" as a space. Base64 never contains
        # spaces, so turning them back restores un-escaped payloads.
        encoded = cred_base64[0].replace(" ", "+")
        credentials = json.loads(base64.b64decode(encoded).decode("utf-8"))

    property_id = source_fields.get("property_id")
    if not property_id:
        raise ValueError("property_id is required to connect to Google Analytics")

    property_ids = [pid.strip() for pid in property_id[0].split(",") if pid.strip()]
    if not property_ids:
        raise ValueError("property_id is required to connect to Google Analytics")

    return {"credentials": credentials, "property_ids": property_ids}
@@ -0,0 +1,95 @@
1
+ # Google Sheets
2
+
3
+ ## Prepare your data
4
+
5
+ We recommend using [Named Ranges](https://support.google.com/docs/answer/63175) to indicate which data should be extracted from a particular spreadsheet, and this is how this source
6
+ will work by default - when called without setting any other options. All the named ranges will be converted into tables named after them and stored in the
7
+ destination.
8
+ * You can let the spreadsheet users add and remove tables by just adding/removing the ranges; you do not need to configure the pipeline again.
9
+ * You can indicate exactly the fragments of interest and only this data will be retrieved so it is the fastest.
10
+ * You can name database tables by changing the range names.
11
+
12
+ If you are not happy with the workflow above, you can:
13
+ * Disable it by setting `get_named_ranges` option to False
14
+ * Enable retrieving all sheets/tabs with `get_sheets` option set to True
15
+ * Pass a list of ranges as supported by Google Sheets in `range_names`
16
+
17
+ Note that hidden columns will be extracted.
18
+
19
+ > 💡 You can load data from many spreadsheets and also rename the tables to which data is loaded. This is standard part of `dlt`, see `load_with_table_rename_and_multiple_spreadsheets` demo in `google_sheets_pipeline.py`
20
+
21
+ ### Make sure your data has headers and is a proper table
22
+ **First row of any extracted range should contain headers**. Please make sure:
23
+ 1. The header names are strings and are unique.
24
+ 2. That all the columns that you intend to extract have a header.
25
+ 3. That data starts exactly at the origin of the range - otherwise source will remove padding but it is a waste of resources!
26
+
27
+ When source detects any problems with headers or table layout **it will issue a WARNING in the log** so it makes sense to run your pipeline script manually/locally and fix all the problems.
28
+ 1. Columns without headers will be removed and not extracted!
29
+ 2. Columns with headers that do not contain any data will be removed.
30
+ 2. If there are any problems with reading headers (i.e. a header is not a string, is empty or is not unique): **the headers row will be extracted as data** and automatic header names will be used.
31
+ 3. Empty rows are ignored
32
+ 4. `dlt` will normalize range names and headers into table and column names - so they may be different in the database than in google sheets. Prefer small cap names without special characters!
33
+
34
+ ### Data Types
35
+ `dlt` normalizer will use first row of data to infer types and will try to coerce following rows - creating variant columns if that is not possible. This is a standard behavior.
36
+ **date time** and **date** types are also recognized and this happens via additional metadata that is retrieved for the first row.
37
+
38
+ ## Passing the spreadsheet id/url and explicit range names
39
+ You can use either the URL of your spreadsheet, which you can copy from the browser, e.g.
40
+ ```
41
+ https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing
42
+ ```
43
+ or spreadsheet id (which is a part of the url)
44
+ ```
45
+ 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4
46
+ ```
47
+ typically you pass it directly to the `google_spreadsheet` function
48
+
49
+ **passing ranges**
50
+
51
+ You can pass explicit ranges to the `google_spreadsheet`:
52
+ 1. sheet names
53
+ 2. named ranges
54
+ 3. any range in Google Sheet format ie. **sheet 1!A1:B7**
55
+
56
+
57
+ ## The `spreadsheet_info` table
58
+ This table is repopulated after every load and keeps the information on loaded ranges:
59
+ * id and title of the spreadsheet
60
+ * name of the range as passed to the source
61
+ * string representation of the loaded range
62
+ * range above in parsed representation
63
+
64
+ ## Running on Airflow (and some under the hood information)
65
+ Internally, the source loads all the data immediately in the `google_spreadsheet` before execution of the pipeline in `run`. No matter how many ranges you request, we make just two calls to the API to retrieve data. This works very well with typical scripts that create a dlt source with `google_spreadsheet` and then run it with `pipeline.run`.
66
+
67
+ In case of Airflow, the source is created and executed separately. In typical configuration where runner is a separate machine, **this will load data twice**.
68
+
69
+ **Moreover, you should not use `scc` decomposition in our Airflow helper**. It will create an instance of the source for each requested range in order to run a task that corresponds to it! Following our [Airflow deployment guide](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file), this is how you should use `tasks.add_run` on `PipelineTasksGroup`:
70
+ ```python
71
+ @dag(
72
+ schedule_interval='@daily',
73
+ start_date=pendulum.datetime(2023, 2, 1),
74
+ catchup=False,
75
+ max_active_runs=1,
76
+ default_args=default_task_args
77
+ )
78
+ def get_named_ranges():
79
+ tasks = PipelineTasksGroup("get_named_ranges", use_data_folder=False, wipe_local_data=True)
80
+
81
+ # import your source from pipeline script
82
+ from google_sheets import google_spreadsheet
83
+
84
+ pipeline = dlt.pipeline(
85
+ pipeline_name="get_named_ranges",
86
+ dataset_name="named_ranges_data",
87
+ destination='bigquery',
88
+ )
89
+
90
+ # do not use decompose to run `google_spreadsheet` in single task
91
+ tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True)
92
+ ```
93
+
94
+ ## Setup credentials
95
+ [We recommend to use service account for any production deployments](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#google-sheets-api-authentication)