omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Defines all the sources and resources needed for Google Analytics V4
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from typing import Iterator, List, Optional, Union
|
|
20
|
+
|
|
21
|
+
import dlt
|
|
22
|
+
from dlt.common import pendulum
|
|
23
|
+
from dlt.common.typing import DictStrAny, TDataItem
|
|
24
|
+
from dlt.sources import DltResource
|
|
25
|
+
from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
|
|
26
|
+
from google.analytics.data_v1beta import BetaAnalyticsDataClient
|
|
27
|
+
from google.analytics.data_v1beta.types import (
|
|
28
|
+
Dimension,
|
|
29
|
+
Metric,
|
|
30
|
+
MinuteRange,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
from .helpers import get_realtime_report, get_report
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dlt.source(max_table_nesting=0)
def google_analytics(
    datetime_dimension: str,
    credentials: Union[
        GcpOAuthCredentials, GcpServiceAccountCredentials
    ] = dlt.secrets.value,
    property_ids: List[str] = dlt.config.value,
    queries: List[DictStrAny] = dlt.config.value,
    start_date: Optional[pendulum.DateTime] = pendulum.datetime(2024, 1, 1),
    end_date: Optional[pendulum.DateTime] = None,
    rows_per_page: int = 10000,
    minute_range_objects: List[MinuteRange] | None = None,
) -> List[DltResource]:
    """
    Builds the Google Analytics V4 resources for a single query.

    Args:
        datetime_dimension: Name of the dimension used as incremental cursor and
            merge key of the "custom" resource.
        credentials: OAuth or service account credentials for the Data API.
        property_ids: Numeric property ids (as strings) to query.
        queries: A list holding exactly one query, a dict with "dimensions"
            and "metrics" entries.
        start_date: Initial value of the incremental cursor.
        end_date: Optional end value of the incremental cursor.
        rows_per_page: Page size passed to the report requests.
        minute_range_objects: Optional minute ranges for the realtime report.

    Returns:
        The "custom" (standard report) and "realtime" resources.

    Raises:
        ValueError: If a property id is not numeric or is 0, if rows_per_page
            is 0, or if `queries` does not contain exactly one query.
    """
    validated_property_ids = []
    for pid in property_ids:
        try:
            int_pid = int(pid)
        except ValueError as err:
            raise ValueError(
                f"{pid} is an invalid google property id. Please use a numeric id, and not your Measurement ID like G-7F1AE12JLR"
            ) from err
        if int_pid == 0:
            raise ValueError(
                "Google Analytics property id is 0. Did you forget to configure it?"
            )
        validated_property_ids.append(int_pid)

    if not rows_per_page:
        raise ValueError("Rows per page cannot be 0")

    # Validate the query list before authenticating, so configuration mistakes
    # surface without touching the network. An empty list used to fail with a
    # bare IndexError on `queries[0]`.
    if not queries:
        raise ValueError(
            "Google Analytics requires exactly one query, but none was given"
        )
    if len(queries) > 1:
        raise ValueError(
            "Google Analytics supports a single query ingestion at a time, please give only one query"
        )
    query = queries[0]

    # generate access token for credentials if we are using OAuth2.0
    if isinstance(credentials, GcpOAuthCredentials):
        credentials.auth("https://www.googleapis.com/auth/analytics.readonly")

    # Build the service object for Google Analytics api.
    client = BetaAnalyticsDataClient(credentials=credentials.to_native_credentials())

    # Dimensions are used exactly as configured; nothing is added here.
    # NOTE(review): the incremental cursor below reads `datetime_dimension` from
    # the rows, so it presumably has to be one of these dimensions - confirm.
    dimensions = query["dimensions"]

    @dlt.resource(
        name="custom",
        merge_key=datetime_dimension,
        write_disposition="merge",
    )
    def basic_report(
        incremental=dlt.sources.incremental(
            datetime_dimension,
            initial_value=start_date,
            end_value=end_date,
            range_end="closed",
            range_start="closed",
        ),
    ) -> Iterator[TDataItem]:
        # Resolve the report window from the incremental state, falling back to
        # the same defaults the source uses when nothing is set.
        report_start = incremental.last_value
        report_end = incremental.end_value
        if report_start is None:
            report_start = pendulum.datetime(2024, 1, 1)
        if report_end is None:
            report_end = pendulum.yesterday()
        for property_id in validated_property_ids:
            yield from get_report(
                client=client,
                property_id=property_id,
                dimension_list=[Dimension(name=dimension) for dimension in dimensions],
                metric_list=[Metric(name=metric) for metric in query["metrics"]],
                per_page=rows_per_page,
                start_date=report_start,
                end_date=report_end,
            )

    # real time report
    @dlt.resource(
        name="realtime",
        merge_key="ingested_at",
        write_disposition="merge",
    )
    def real_time_report() -> Iterator[TDataItem]:
        for property_id in validated_property_ids:
            yield from get_realtime_report(
                client=client,
                property_id=property_id,
                dimension_list=[Dimension(name=dimension) for dimension in dimensions],
                metric_list=[Metric(name=metric) for metric in query["metrics"]],
                per_page=rows_per_page,
                minute_range_objects=minute_range_objects,
            )

    return [basic_report, real_time_report]
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
This module contains helpers that process data and make it ready for loading into the database
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import base64
|
|
20
|
+
import json
|
|
21
|
+
from typing import Any, Iterator, List, Union
|
|
22
|
+
from urllib.parse import parse_qs, urlparse
|
|
23
|
+
|
|
24
|
+
import proto
|
|
25
|
+
from dlt.common.exceptions import MissingDependencyException
|
|
26
|
+
from dlt.common.pendulum import pendulum
|
|
27
|
+
from dlt.common.typing import DictStrAny, TDataItem, TDataItems
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from google.analytics.data_v1beta import BetaAnalyticsDataClient # noqa: F401
|
|
31
|
+
from google.analytics.data_v1beta.types import (
|
|
32
|
+
DateRange,
|
|
33
|
+
Dimension,
|
|
34
|
+
DimensionExpression, # noqa: F401
|
|
35
|
+
DimensionMetadata, # noqa: F401
|
|
36
|
+
GetMetadataRequest, # noqa: F401
|
|
37
|
+
Metadata, # noqa: F401
|
|
38
|
+
Metric,
|
|
39
|
+
MetricMetadata, # noqa: F401
|
|
40
|
+
MetricType,
|
|
41
|
+
MinuteRange,
|
|
42
|
+
RunRealtimeReportRequest,
|
|
43
|
+
RunReportRequest,
|
|
44
|
+
RunReportResponse,
|
|
45
|
+
)
|
|
46
|
+
except ImportError:
|
|
47
|
+
raise MissingDependencyException(
|
|
48
|
+
"Google Analytics API Client", ["google-analytics-data"]
|
|
49
|
+
)
|
|
50
|
+
try:
|
|
51
|
+
from apiclient.discovery import Resource, build # type: ignore # noqa: F401
|
|
52
|
+
except ImportError:
|
|
53
|
+
raise MissingDependencyException("Google API Client", ["google-api-python-client"])
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def to_dict(item: Any) -> Iterator[TDataItem]:
    """
    Converts a proto message into a plain dictionary.

    The message is serialized to JSON with proto field names preserved, enums
    rendered by name and default-valued fields omitted, then parsed back.

    Args:
        item: The proto message to convert.

    Yields:
        A single dictionary built from the message.
    """
    serialized = proto.Message.to_json(
        item,
        preserving_proto_field_name=True,
        use_integers_for_enums=False,
        including_default_value_fields=False,
    )
    yield json.loads(serialized)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_realtime_report(
    client: Resource,
    property_id: int,
    dimension_list: List[Dimension],
    metric_list: List[Metric],
    per_page: int,
    minute_range_objects: List[MinuteRange] | None = None,
) -> Iterator[TDataItem]:
    """
    Runs a realtime report and yields a dictionary for every row of it.

    Args:
        client: The Google Analytics client used to make requests.
        property_id: A reference to the Google Analytics project.
            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
        dimension_list: A list of all the dimensions requested in the query.
        metric_list: A list of all the metrics requested in the query.
        per_page: Maximum number of rows requested from the API (sent as `limit`).
        minute_range_objects: Optional minute ranges to report on. When empty
            or None, no minute ranges are sent with the request.

    Yields:
        Generator of all rows of data in the report, each tagged with
        "property_id" and "ingested_at".
    """
    # Date-only stamp (no time component) shared by every row of this call.
    ingest_at = pendulum.now().to_date_string()

    # The realtime request carries no offset (the Data API's runRealtimeReport
    # request has no offset field), so it cannot be paginated. Re-issuing it in
    # a loop only re-fetched the same rows and yielded duplicates whenever a
    # full page came back, so exactly one request is made.
    request = RunRealtimeReportRequest(
        property=f"properties/{property_id}",
        dimensions=dimension_list,
        metrics=metric_list,
        limit=per_page,
        minute_ranges=minute_range_objects if minute_range_objects else None,
    )
    response = client.run_realtime_report(request)

    for row in process_report(response=response, ingest_at=ingest_at):
        row["property_id"] = str(property_id)
        yield row
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_report(
    client: Resource,
    property_id: int,
    dimension_list: List[Dimension],
    metric_list: List[Metric],
    per_page: int,
    start_date: pendulum.DateTime,
    end_date: pendulum.DateTime,
) -> Iterator[TDataItem]:
    """
    Pages through a standard report and yields one dictionary per row.

    Requests are issued with an increasing offset until a page comes back with
    fewer than `per_page` rows or the offset exceeds 1000000.

    Args:
        client: The Google Analytics client used to make requests.
        property_id: A reference to the Google Analytics project.
            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
        dimension_list: A list of all the dimensions requested in the query.
        metric_list: A list of all the metrics requested in the query.
        per_page: Describes how many rows there should be per page.
        start_date: The starting date of the query.
        end_date: The ending date of the query.

    Yields:
        Generator of all rows of data in the report, each tagged with
        "property_id".
    """
    offset = 0
    has_more = True
    while has_more:
        response = client.run_report(
            RunReportRequest(
                property=f"properties/{property_id}",
                dimensions=dimension_list,
                metrics=metric_list,
                limit=per_page,
                offset=offset,
                date_ranges=[
                    DateRange(
                        start_date=start_date.to_date_string(),
                        end_date=end_date.to_date_string(),
                    )
                ],
            )
        )

        # Turn the raw page into row dictionaries and tag each with its property.
        for row in process_report(response=response):
            row["property_id"] = str(property_id)
            yield row

        offset += per_page
        # Keep going only after a full page and while below the offset cap.
        has_more = len(response.rows) >= per_page and offset <= 1000000
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def process_report(
    response: RunReportResponse, ingest_at: str | None = None
) -> Iterator[TDataItems]:
    """
    Converts a report response into one flat dictionary per row.

    Each dictionary maps dimension names to their resolved values and metric
    names to their converted values.

    Args:
        response: The report response; its dimension headers, metric headers
            and rows are read.
        ingest_at: Optional stamp stored under "ingested_at" in every row.

    Yields:
        One dictionary per row of the response.
    """
    metric_headers = list(response.metric_headers)
    dimension_names = [header.name for header in response.dimension_headers]

    for row in response.rows:
        response_dict: DictStrAny = {
            dimension_name: _resolve_dimension_value(
                dimension_name, dimension_value.value
            )
            for dimension_name, dimension_value in zip(
                dimension_names, row.dimension_values
            )
        }

        # Metric values are matched to headers by position. Indexing (rather
        # than zip) keeps a row with missing metric values a hard error.
        for position, metric_header in enumerate(metric_headers):
            response_dict[metric_header.name] = process_metric_value(
                metric_type=metric_header.type_,
                value=row.metric_values[position].value,
            )

        if ingest_at is not None:
            response_dict["ingested_at"] = ingest_at

        yield response_dict
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def process_metric_value(metric_type: MetricType, value: str) -> Union[str, int, float]:
    """
    Converts a metric value from its string form according to the metric type.

    Args:
        metric_type: The type of the metric; only its numeric `value` is read.
        value: The value of the metric as a string.

    Returns:
        The raw string for type code 0, an int for type code 1 and a float for
        every other type code.
    """
    # GA4 metric types: https://developers.google.com/analytics/devguides/reporting/data/v1/rest/v1beta/MetricType
    type_code = metric_type.value
    if type_code == 0:
        return value
    if type_code == 1:
        return int(value)
    # Every remaining type code is treated as a floating point number.
    return float(value)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
|
|
236
|
+
if dimension_name == "date":
|
|
237
|
+
return pendulum.from_format(dimension_value, "YYYYMMDD", tz="UTC")
|
|
238
|
+
elif dimension_name == "dateHour":
|
|
239
|
+
return pendulum.from_format(dimension_value, "YYYYMMDDHH", tz="UTC")
|
|
240
|
+
elif dimension_name == "dateHourMinute":
|
|
241
|
+
return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
|
|
242
|
+
else:
|
|
243
|
+
return dimension_value
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def convert_minutes_ranges_to_minute_range_objects(
    minutes_ranges: str,
) -> List[MinuteRange]:
    """
    Parses a comma separated list of minute ranges into MinuteRange objects.

    Every range has the form "<number>-<number>", for example "1-2,5-6".
    The first number becomes `end_minutes_ago` and the second one
    `start_minutes_ago`. Spaces are ignored.

    Args:
        minutes_ranges: The ranges as text.

    Returns:
        One MinuteRange per range, in input order.

    Raises:
        ValueError: If the input is empty, or a range is not exactly two
            digit-only numbers separated by a single "-".
    """
    format_error = "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"

    cleaned = minutes_ranges.strip().replace(" ", "")
    # The previous emptiness check compared the split list against "" and could
    # never fire; check the text itself instead.
    if not cleaned:
        raise ValueError(format_error)

    minute_range_objects = []
    for min_range in cleaned.split(","):
        if "-" not in min_range:
            raise ValueError(format_error)

        parts = min_range.split("-")
        # Reject inputs such as "1-2-3" instead of silently dropping the tail.
        if len(parts) != 2:
            raise ValueError(format_error)

        if not parts[0].isdigit() or not parts[1].isdigit():
            raise ValueError(
                f"Invalid input '{min_range}'. Both start and end minutes must be digits. For example: 1-2,5-6"
            )

        end_minutes_ago = int(parts[0])
        start_minutes_ago = int(parts[1])
        minute_range_objects.append(
            MinuteRange(
                name=f"{end_minutes_ago}-{start_minutes_ago} minutes ago",
                start_minutes_ago=start_minutes_ago,
                end_minutes_ago=end_minutes_ago,
            )
        )

    return minute_range_objects
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def parse_google_analytics_uri(uri: str):
    """
    Extracts credentials and property ids from a Google Analytics source URI.

    The query string must contain `property_id` (one id or a comma separated
    list) and either `credentials_path` (path to a JSON credentials file) or
    `credentials_base64` (base64 encoded JSON credentials). When both
    credential parameters are given, `credentials_path` wins.

    Args:
        uri: The source URI whose query string is parsed.

    Returns:
        A dict with "credentials" (the decoded JSON) and "property_ids"
        (list of stripped, non-empty id strings).

    Raises:
        ValueError: If no credentials parameter or no property id is given.
    """
    source_fields = parse_qs(urlparse(uri).query)
    cred_path = source_fields.get("credentials_path")
    cred_base64 = source_fields.get("credentials_base64")

    if not cred_path and not cred_base64:
        raise ValueError(
            "credentials_path or credentials_base64 is required to connect Google Analytics"
        )

    if cred_path:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(cred_path[0], "r", encoding="utf-8") as f:
            credentials = json.load(f)
    else:
        credentials = json.loads(base64.b64decode(cred_base64[0]).decode("utf-8"))

    property_id = source_fields.get("property_id")
    if not property_id:
        raise ValueError("property_id is required to connect to Google Analytics")

    # Only the first `property_id` parameter is read; it may hold several ids.
    property_ids = [pid.strip() for pid in property_id[0].split(",") if pid.strip()]
    if not property_ids:
        raise ValueError("property_id is required to connect to Google Analytics")

    return {"credentials": credentials, "property_ids": property_ids}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# Google Sheets
|
|
2
|
+
|
|
3
|
+
## Prepare your data
|
|
4
|
+
|
|
5
|
+
We recommend using [Named Ranges](link to gsheets) to indicate which data should be extracted from a particular spreadsheet, and this is how this source
|
|
6
|
+
will work by default - when called without setting any other options. All the named ranges will be converted into tables named after them and stored in the
|
|
7
|
+
destination.
|
|
8
|
+
* You can let the spreadsheet users add and remove tables by just adding/removing the ranges; you do not need to configure the pipeline again.
|
|
9
|
+
* You can indicate exactly the fragments of interest and only this data will be retrieved so it is the fastest.
|
|
10
|
+
* You can name database tables by changing the range names.
|
|
11
|
+
|
|
12
|
+
If you are not happy with the workflow above, you can:
|
|
13
|
+
* Disable it by setting `get_named_ranges` option to False
|
|
14
|
+
* Enable retrieving all sheets/tabs with `get_sheets` option set to True
|
|
15
|
+
* Pass a list of ranges as supported by Google Sheets in `range_names`
|
|
16
|
+
|
|
17
|
+
Note that hidden columns will be extracted.
|
|
18
|
+
|
|
19
|
+
> 💡 You can load data from many spreadsheets and also rename the tables to which data is loaded. This is standard part of `dlt`, see `load_with_table_rename_and_multiple_spreadsheets` demo in `google_sheets_pipeline.py`
|
|
20
|
+
|
|
21
|
+
### Make sure your data has headers and is a proper table
|
|
22
|
+
**First row of any extracted range should contain headers**. Please make sure:
|
|
23
|
+
1. The header names are strings and are unique.
|
|
24
|
+
2. That all the columns that you intend to extract have a header.
|
|
25
|
+
3. That data starts exactly at the origin of the range - otherwise source will remove padding but it is a waste of resources!
|
|
26
|
+
|
|
27
|
+
When source detects any problems with headers or table layout **it will issue a WARNING in the log** so it makes sense to run your pipeline script manually/locally and fix all the problems.
|
|
28
|
+
1. Columns without headers will be removed and not extracted!
|
|
29
|
+
2. Columns with headers that do not contain any data will be removed.
|
|
30
|
+
2. If there are any problems with reading headers (i.e. a header is not a string, is empty, or is not unique): **the headers row will be extracted as data** and automatic header names will be used.
|
|
31
|
+
3. Empty rows are ignored
|
|
32
|
+
4. `dlt` will normalize range names and headers into table and column names - so they may be different in the database than in Google Sheets. Prefer lowercase names without special characters!
|
|
33
|
+
|
|
34
|
+
### Data Types
|
|
35
|
+
`dlt` normalizer will use first row of data to infer types and will try to coerce following rows - creating variant columns if that is not possible. This is a standard behavior.
|
|
36
|
+
**date time** and **date** types are also recognized and this happens via additional metadata that is retrieved for the first row.
|
|
37
|
+
|
|
38
|
+
## Passing the spreadsheet id/url and explicit range names
|
|
39
|
+
You can use either the URL of your spreadsheet, which you can copy from the browser, e.g.
|
|
40
|
+
```
|
|
41
|
+
https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing
|
|
42
|
+
```
|
|
43
|
+
or spreadsheet id (which is a part of the url)
|
|
44
|
+
```
|
|
45
|
+
1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4
|
|
46
|
+
```
|
|
47
|
+
typically you pass it directly to the `google_spreadsheet` function
|
|
48
|
+
|
|
49
|
+
**passing ranges**
|
|
50
|
+
|
|
51
|
+
You can pass explicit ranges to the `google_spreadsheet`:
|
|
52
|
+
1. sheet names
|
|
53
|
+
2. named ranges
|
|
54
|
+
3. any range in Google Sheet format ie. **sheet 1!A1:B7**
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
## The `spreadsheet_info` table
|
|
58
|
+
This table is repopulated after every load and keeps the information on loaded ranges:
|
|
59
|
+
* id and title of the spreadsheet
|
|
60
|
+
* name of the range as passed to the source
|
|
61
|
+
* string representation of the loaded range
|
|
62
|
+
* range above in parsed representation
|
|
63
|
+
|
|
64
|
+
## Running on Airflow (and some under the hood information)
|
|
65
|
+
Internally, the source loads all the data immediately in the `google_spreadsheet` before execution of the pipeline in `run`. No matter how many ranges you request, we make just two calls to the API to retrieve data. This works very well with typical scripts that create a dlt source with `google_spreadsheet` and then run it with `pipeline.run`.
|
|
66
|
+
|
|
67
|
+
In case of Airflow, the source is created and executed separately. In typical configuration where runner is a separate machine, **this will load data twice**.
|
|
68
|
+
|
|
69
|
+
**Moreover, you should not use `scc` decomposition in our Airflow helper**. It will create an instance of the source for each requested range in order to run a task that corresponds to it! Following our [Airflow deployment guide](https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file), this is how you should use `tasks.add_run` on `PipelineTasksGroup`:
|
|
70
|
+
```python
|
|
71
|
+
@dag(
|
|
72
|
+
schedule_interval='@daily',
|
|
73
|
+
start_date=pendulum.datetime(2023, 2, 1),
|
|
74
|
+
catchup=False,
|
|
75
|
+
max_active_runs=1,
|
|
76
|
+
default_args=default_task_args
|
|
77
|
+
)
|
|
78
|
+
def get_named_ranges():
|
|
79
|
+
tasks = PipelineTasksGroup("get_named_ranges", use_data_folder=False, wipe_local_data=True)
|
|
80
|
+
|
|
81
|
+
# import your source from pipeline script
|
|
82
|
+
from google_sheets import google_spreadsheet
|
|
83
|
+
|
|
84
|
+
pipeline = dlt.pipeline(
|
|
85
|
+
pipeline_name="get_named_ranges",
|
|
86
|
+
dataset_name="named_ranges_data",
|
|
87
|
+
destination='bigquery',
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# do not use decompose to run `google_spreadsheet` in single task
|
|
91
|
+
tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Setup credentials
|
|
95
|
+
[We recommend to use service account for any production deployments](https://dlthub.com/docs/dlt-ecosystem/verified-sources/google_sheets#google-sheets-api-authentication)
|