ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/fundraiseup/__init__.py
ADDED

@@ -0,0 +1,95 @@
+"""Fundraiseup source for ingesting donations, events, fundraisers, recurring plans, and supporters."""
+
+from typing import Any, Dict, Generator, Iterable, TypedDict
+
+import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.sources import DltResource
+
+from .client import FundraiseupClient
+
+
+class DonationCursor(TypedDict):
+    id: str
+    created_at: pendulum.DateTime
+
+
+def order_by_created(record) -> DonationCursor:
+    last_value = None
+    if len(record) == 1:
+        (record,) = record
+    else:
+        record, last_value = record
+
+    cursor: DonationCursor = {
+        "id": record["id"],
+        "created_at": ensure_pendulum_datetime(record["created_at"]),
+    }
+
+    if last_value is None:
+        return cursor
+
+    return max(cursor, last_value, key=lambda v: v["created_at"])
+
+
+@dlt.source(name="fundraiseup", max_table_nesting=0)
+def fundraiseup_source(api_key: str) -> Iterable[DltResource]:
+    """
+    Return resources for Fundraiseup API.
+
+    Args:
+        api_key: API key for authentication
+
+    Returns:
+        Iterable of DLT resources
+    """
+    client = FundraiseupClient(api_key=api_key)
+
+    # Define available resources and their configurations
+    resources = {
+        "donations": {"write_disposition": "replace", "primary_key": "id"},
+        "events": {"write_disposition": "replace", "primary_key": "id"},
+        "fundraisers": {"write_disposition": "replace", "primary_key": "id"},
+        "recurring_plans": {"write_disposition": "replace", "primary_key": "id"},
+        "supporters": {"write_disposition": "replace", "primary_key": "id"},
+    }
+
+    def create_resource(resource_name: str, config: Dict[str, Any]) -> DltResource:
+        """Create a DLT resource dynamically."""
+
+        @dlt.resource(
+            name=resource_name,
+            write_disposition=config["write_disposition"],
+            primary_key=config["primary_key"],
+        )
+        def generic_resource() -> Generator[Dict[str, Any], None, None]:
+            """Generic resource that yields batches directly."""
+            for batch in client.get_paginated_data(resource_name):
+                yield batch  # type: ignore[misc]
+
+        return generic_resource()
+
+    @dlt.resource(
+        name="donations:incremental",
+        write_disposition="merge",
+        primary_key="id",
+    )
+    def donations_incremental(
+        last_record: dlt.sources.incremental[DonationCursor] = dlt.sources.incremental(
+            "$",
+            range_start="closed",
+            range_end="closed",
+            last_value_func=order_by_created,
+        ),
+    ):
+        params = {}
+        if last_record.last_value is not None:
+            params["starting_after"] = last_record.last_value["id"]
+        for batch in client.get_paginated_data("donations", params=params):
+            yield batch  # type: ignore[misc]
+
+    # Return all resources
+    return [donations_incremental] + [
+        create_resource(name, config) for name, config in resources.items()
+    ]
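
The `donations:incremental` resource above uses `"$"` as the cursor path, meaning the whole record is handed to `order_by_created`, which dlt invokes either with a 1-tuple (the first value seen) or with a `(new_record, previous_cursor)` pair. A minimal sketch of that contract, assuming ingestr 0.14.x is installed so the module above is importable:

    from ingestr.src.fundraiseup import order_by_created

    # First record seen: dlt passes a 1-tuple, so the record itself seeds the cursor.
    cursor = order_by_created(({"id": "d_1", "created_at": "2024-01-01T00:00:00Z"},))

    # Later records: dlt passes (new_record, previous_cursor); the newer created_at wins.
    cursor = order_by_created(
        ({"id": "d_2", "created_at": "2024-02-01T00:00:00Z"}, cursor)
    )
    print(cursor["id"])  # d_2 -- its created_at is later, so it becomes the cursor
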
ingestr/src/fundraiseup/client.py
ADDED

@@ -0,0 +1,81 @@
+"""Fundraiseup API Client for handling authentication and paginated requests."""
+
+from typing import Any, Dict, Iterator, Optional
+
+from ingestr.src.http_client import create_client
+
+
+class FundraiseupClient:
+    """Client for interacting with Fundraiseup API v1."""
+
+    def __init__(self, api_key: str):
+        """
+        Initialize Fundraiseup API client.
+
+        Args:
+            api_key: API key for authentication
+        """
+        self.api_key = api_key
+        self.base_url = "https://api.fundraiseup.com/v1"
+        # Use shared HTTP client with retry logic for rate limiting
+        self.client = create_client(retry_status_codes=[429, 500, 502, 503, 504])
+
+    def get_paginated_data(
+        self,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        page_size: int = 100,
+    ) -> Iterator[list[Dict[str, Any]]]:
+        """
+        Fetch paginated data from a Fundraiseup API endpoint using cursor-based pagination.
+
+        Args:
+            endpoint: API endpoint path (e.g., "donations")
+            params: Additional query parameters
+            page_size: Number of items per page (default 100)
+
+        Yields:
+            Batches of items from the API
+        """
+        url = f"{self.base_url}/{endpoint}"
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+
+        if params is None:
+            params = {}
+
+        params["limit"] = page_size
+        starting_after = None
+
+        while True:
+            # Add cursor for pagination if not first page
+            if starting_after:
+                params["starting_after"] = starting_after
+
+            response = self.client.get(url=url, headers=headers, params=params)
+            response.raise_for_status()
+
+            data = response.json()
+
+            # Handle both list response and object with data array
+            if isinstance(data, list):
+                items = data
+                has_more = len(items) == page_size
+            else:
+                items = data.get("data", [])
+                has_more = data.get("has_more", False)
+
+            if not items:
+                break
+
+            yield items
+
+            # Set cursor for next page
+            if has_more and items:
+                starting_after = items[-1].get("id")
+                if not starting_after:
+                    break
+            else:
+                break
ingestr/src/github/__init__.py
CHANGED

@@ -4,13 +4,14 @@ import urllib.parse
 from typing import Iterator, Optional, Sequence

 import dlt
+import pendulum
 from dlt.common.typing import TDataItems
 from dlt.sources import DltResource

 from .helpers import get_reactions_data, get_rest_pages, get_stargazers


-@dlt.source
+@dlt.source(max_table_nesting=0)
 def github_reactions(
     owner: str,
     name: str,

@@ -67,7 +68,11 @@

 @dlt.source(max_table_nesting=0)
 def github_repo_events(
-    owner: str,
+    owner: str,
+    name: str,
+    access_token: str,
+    start_date: pendulum.DateTime,
+    end_date: Optional[pendulum.DateTime] = None,
 ) -> DltResource:
     """Gets events for repository `name` with owner `owner` incrementally.

@@ -86,11 +91,14 @@
     """

     # use naming function in table name to generate separate tables for each event
-    @dlt.resource(
+    @dlt.resource(
+        primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
+    )
     def repo_events(
         last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
             "created_at",
-            initial_value=
+            initial_value=start_date.isoformat(),
+            end_value=end_date.isoformat() if end_date else None,
             last_value_func=max,
             range_end="closed",
             range_start="closed",

@@ -100,8 +108,35 @@
             f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
         )

+        # Get the date range from the incremental state
+        start_filter = pendulum.parse(
+            last_created_at.last_value or last_created_at.initial_value
+        )
+        end_filter = (
+            pendulum.parse(last_created_at.end_value)
+            if last_created_at.end_value
+            else pendulum.now()
+        )
+
         for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
-
+            # Filter events by date range
+            filtered_events = []
+            for event in page:
+                event_date = pendulum.parse(event["created_at"])
+
+                # Check if event is within the date range
+                if event_date >= start_filter:
+                    if end_filter is None or event_date <= end_filter:
+                        filtered_events.append(event)
+                    elif event_date > end_filter:
+                        # Skip events that are newer than our end date
+                        continue
+                else:
+                    # Events are ordered by date desc, so if we hit an older event, we can stop
+                    break
+
+            if filtered_events:
+                yield filtered_events

         # stop requesting pages if the last element was already older than initial value
         # note: incremental will skip those items anyway, we just do not want to use the api limits

@@ -114,7 +149,7 @@ def github_repo_events(
     return repo_events


-@dlt.source
+@dlt.source(max_table_nesting=0)
 def github_stargazers(
     owner: str,
     name: str,
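
With the widened signature, `github_repo_events` can be bounded to an explicit window instead of relying on a hard-coded initial value. A sketch of a bounded backfill; the owner, repo, and token below are placeholders:

    import dlt
    import pendulum

    from ingestr.src.github import github_repo_events

    pipeline = dlt.pipeline(
        pipeline_name="github_events_demo",  # illustrative
        destination="duckdb",
        dataset_name="github",
    )

    # Events outside [start_date, end_date] are filtered out before being
    # yielded, and paging stops early once events predate start_date.
    events = github_repo_events(
        owner="some-org",             # placeholder
        name="some-repo",             # placeholder
        access_token="GITHUB_TOKEN",  # placeholder
        start_date=pendulum.datetime(2024, 1, 1),
        end_date=pendulum.datetime(2024, 2, 1),
    )
    pipeline.run(events)
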
ingestr/src/github/helpers.py
CHANGED

@@ -103,9 +103,9 @@ def get_reactions_data(


 def _extract_top_connection(data: StrAny, node_type: str) -> StrAny:
-    assert (
-
-    )
+    assert isinstance(data, dict) and len(data) == 1, (
+        f"The data with list of {node_type} must be a dictionary and contain only one element"
+    )
     data = next(iter(data.values()))
     return data[node_type]  # type: ignore


@@ -158,7 +158,7 @@ def _get_graphql_pages(
         )
         items_count += len(data_items)
         print(
-            f
+            f"Got {len(data_items)}/{items_count} {node_type}s, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
         )
         if data_items:
             yield data_items

@@ -187,7 +187,7 @@ def _get_comment_reaction(comment_ids: List[str], access_token: str) -> StrAny:
     # print(query)
     page, rate_limit = _run_graphql_query(access_token, query, {})
     print(
-        f
+        f"Got {len(page)} comments, query cost {rate_limit['cost']}, remaining credits: {rate_limit['remaining']}"
     )
     data.update(page)
     return data

ingestr/src/google_analytics/__init__.py
CHANGED

@@ -7,15 +7,16 @@ from typing import Iterator, List, Optional, Union
 import dlt
 from dlt.common import pendulum
 from dlt.common.typing import DictStrAny, TDataItem
-from dlt.
+from dlt.sources import DltResource
 from dlt.sources.credentials import GcpOAuthCredentials, GcpServiceAccountCredentials
 from google.analytics.data_v1beta import BetaAnalyticsDataClient
 from google.analytics.data_v1beta.types import (
     Dimension,
     Metric,
+    MinuteRange,
 )

-from .helpers import get_report
+from .helpers import get_realtime_report, get_report


 @dlt.source(max_table_nesting=0)

@@ -29,6 +30,7 @@ def google_analytics(
     start_date: Optional[pendulum.DateTime] = pendulum.datetime(2024, 1, 1),
     end_date: Optional[pendulum.DateTime] = None,
     rows_per_page: int = 10000,
+    minute_range_objects: List[MinuteRange] | None = None,
 ) -> List[DltResource]:
     try:
         property_id = int(property_id)

@@ -58,7 +60,7 @@
     dimensions = query["dimensions"]

     @dlt.resource(
-        name="
+        name="custom",
         merge_key=datetime_dimension,
         write_disposition="merge",
     )

@@ -87,6 +89,22 @@
         end_date=end_date,
     )

+    # real time report
+    @dlt.resource(
+        name="realtime",
+        merge_key="ingested_at",
+        write_disposition="merge",
+    )
+    def real_time_report() -> Iterator[TDataItem]:
+        yield from get_realtime_report(
+            client=client,
+            property_id=property_id,
+            dimension_list=[Dimension(name=dimension) for dimension in dimensions],
+            metric_list=[Metric(name=metric) for metric in query["metrics"]],
+            per_page=rows_per_page,
+            minute_range_objects=minute_range_objects,
+        )
+
     # res = dlt.resource(
     #     basic_report, name="basic_report", merge_key=datetime_dimension, write_disposition="merge"
     # )(

@@ -103,4 +121,4 @@
     #     ),
     # )

-    return [basic_report]
+    return [basic_report, real_time_report]
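
The new `realtime` resource forwards `minute_range_objects` to the Data API unchanged, so callers can scope the realtime report to trailing windows. A small sketch of constructing them, using the same field semantics as the helper code below (both bounds are expressed in "minutes ago", with `start_minutes_ago` the older bound):

    from google.analytics.data_v1beta.types import MinuteRange

    minute_range_objects = [
        MinuteRange(name="0-5 minutes ago", start_minutes_ago=5, end_minutes_ago=0),
        MinuteRange(name="10-15 minutes ago", start_minutes_ago=15, end_minutes_ago=10),
    ]
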
ingestr/src/google_analytics/helpers.py
CHANGED

@@ -2,8 +2,10 @@
 This module contains helpers that process data and make it ready for loading into the database
 """

+import base64
 import json
 from typing import Any, Iterator, List, Union
+from urllib.parse import parse_qs, urlparse

 import proto
 from dlt.common.exceptions import MissingDependencyException

@@ -22,6 +24,8 @@ try:
     Metric,
     MetricMetadata,  # noqa: F401
     MetricType,
+    MinuteRange,
+    RunRealtimeReportRequest,
     RunReportRequest,
     RunReportResponse,
 )

@@ -52,6 +56,53 @@ def to_dict(item: Any) -> Iterator[TDataItem]:
     yield item


+def get_realtime_report(
+    client: Resource,
+    property_id: int,
+    dimension_list: List[Dimension],
+    metric_list: List[Metric],
+    per_page: int,
+    minute_range_objects: List[MinuteRange] | None = None,
+) -> Iterator[TDataItem]:
+    """
+    Gets all the possible pages of reports with the given query parameters.
+    Processes every page and yields a dictionary for every row of the report.
+
+    Args:
+        client: The Google Analytics client used to make requests.
+        property_id: A reference to the Google Analytics project.
+            More info: https://developers.google.com/analytics/devguides/reporting/data/v1/property-id
+        dimension_list: A list of all the dimensions requested in the query.
+        metric_list: A list of all the metrics requested in the query.
+        limit: Describes how many rows there should be per page.
+
+    Yields:
+        Generator of all rows of data in the report.
+    """
+    offset = 0
+    ingest_at = pendulum.now().to_date_string()
+
+    while True:
+        request = RunRealtimeReportRequest(
+            property=f"properties/{property_id}",
+            dimensions=dimension_list,
+            metrics=metric_list,
+            limit=per_page,
+            minute_ranges=minute_range_objects if minute_range_objects else None,
+        )
+        response = client.run_realtime_report(request)
+
+        # process request
+        processed_response_generator = process_report(
+            response=response, ingest_at=ingest_at
+        )
+        # import pdb; pdb.set_trace()
+        yield from processed_response_generator
+        offset += per_page
+        if len(response.rows) < per_page or offset > 1000000:
+            break
+
+
 def get_report(
     client: Resource,
     property_id: int,

@@ -79,10 +130,6 @@ def get_report(
         Generator of all rows of data in the report.
     """

-    print(
-        "fetching for daterange", start_date.to_date_string(), end_date.to_date_string()
-    )
-
     offset = 0
     while True:
         request = RunReportRequest(

@@ -98,9 +145,11 @@
                 )
             ],
         )
-        # process request
         response = client.run_report(request)
+
+        # process request
         processed_response_generator = process_report(response=response)
+
         # import pdb; pdb.set_trace()
         yield from processed_response_generator
         offset += per_page

@@ -108,7 +157,9 @@
             break


-def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
+def process_report(
+    response: RunReportResponse, ingest_at: str | None = None
+) -> Iterator[TDataItems]:
     metrics_headers = [header.name for header in response.metric_headers]
     dimensions_headers = [header.name for header in response.dimension_headers]

@@ -131,6 +182,8 @@ def process_report(response: RunReportResponse) -> Iterator[TDataItems]:
                 metric_type=metric_type, value=row.metric_values[i].value
             )
             response_dict[metrics_headers[i]] = metric_value
+        if ingest_at is not None:
+            response_dict["ingested_at"] = ingest_at

         unique_key = "-".join(list(response_dict.keys()))
         if unique_key not in distinct_key_combinations:

@@ -170,3 +223,68 @@ def _resolve_dimension_value(dimension_name: str, dimension_value: str) -> Any:
         return pendulum.from_format(dimension_value, "YYYYMMDDHHmm", tz="UTC")
     else:
         return dimension_value
+
+
+def convert_minutes_ranges_to_minute_range_objects(
+    minutes_ranges: str,
+) -> List[MinuteRange]:
+    minutes_ranges = minutes_ranges.strip()
+    minutes = minutes_ranges.replace(" ", "").split(",")
+    if minutes == "":
+        raise ValueError(
+            "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"
+        )
+
+    minute_range_objects = []
+    for min_range in minutes:
+        if "-" not in min_range:
+            raise ValueError(
+                "Invalid input. Minutes range should be startminute-endminute format. For example: 1-2,5-6"
+            )
+        parts = min_range.split("-")
+
+        if not parts[0].isdigit() or not parts[1].isdigit():
+            raise ValueError(
+                f"Invalid input '{min_range}'. Both start and end minutes must be digits. For example: 1-2,5-6"
+            )
+
+        end_minutes_ago = int(parts[0])
+        start_minutes_ago = int(parts[1])
+        minute_range_objects.append(
+            MinuteRange(
+                name=f"{end_minutes_ago}-{start_minutes_ago} minutes ago",
+                start_minutes_ago=start_minutes_ago,
+                end_minutes_ago=end_minutes_ago,
+            )
+        )
+
+    return minute_range_objects
+
+
+def parse_google_analytics_uri(uri: str):
+    parse_uri = urlparse(uri)
+    source_fields = parse_qs(parse_uri.query)
+    cred_path = source_fields.get("credentials_path")
+    cred_base64 = source_fields.get("credentials_base64")
+
+    if not cred_path and not cred_base64:
+        raise ValueError(
+            "credentials_path or credentials_base64 is required to connect Google Analytics"
+        )
+    credentials = {}
+    if cred_path:
+        with open(cred_path[0], "r") as f:
+            credentials = json.load(f)
+    elif cred_base64:
+        credentials = json.loads(base64.b64decode(cred_base64[0]).decode("utf-8"))
+
+    property_id = source_fields.get("property_id")
+    if not property_id:
+        raise ValueError("property_id is required to connect to Google Analytics")
+
+    if (not cred_path and not cred_base64) or (not property_id):
+        raise ValueError(
+            "credentials_path or credentials_base64 and property_id are required to connect Google Analytics"
+        )
+
+    return {"credentials": credentials, "property_id": property_id[0]}
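
Note the ordering in `convert_minutes_ranges_to_minute_range_objects`: in an input like "1-2", the first number becomes `end_minutes_ago` (the more recent bound) and the second becomes `start_minutes_ago`. A quick check, assuming the package is installed:

    from ingestr.src.google_analytics.helpers import (
        convert_minutes_ranges_to_minute_range_objects,
    )

    ranges = convert_minutes_ranges_to_minute_range_objects("1-2, 5-6")
    print(ranges[0].name)               # "1-2 minutes ago"
    print(ranges[0].start_minutes_ago)  # 2
    print(ranges[0].end_minutes_ago)    # 1
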
ingestr/src/google_sheets/__init__.py
CHANGED

@@ -70,9 +70,9 @@ def google_spreadsheet(
         spreadsheet_id=spreadsheet_id,
         range_names=list(all_range_names),
     )
-    assert len(all_range_names) == len(
-
-    )
+    assert len(all_range_names) == len(all_range_data), (
+        "Google Sheets API must return values for all requested ranges"
+    )

     # get metadata for two first rows of each range
     # first should contain headers

@@ -126,7 +126,7 @@
         headers = get_range_headers(headers_metadata, name)
         if headers is None:
             # generate automatic headers and treat the first row as data
-            headers = [f"col_{idx+1}" for idx in range(len(headers_metadata))]
+            headers = [f"col_{idx + 1}" for idx in range(len(headers_metadata))]
             data_row_metadata = headers_metadata
             rows_data = values[0:]
             logger.warning(

ingestr/src/google_sheets/helpers/data_processing.py
CHANGED

@@ -149,12 +149,12 @@ def get_range_headers(headers_metadata: List[DictStrAny], range_name: str) -> List[str]:
             header_val = str(f"col_{idx + 1}")
         else:
             logger.warning(
-                f"In range {range_name}, header value: {header_val} at position {idx+1} is not a string!"
+                f"In range {range_name}, header value: {header_val} at position {idx + 1} is not a string!"
             )
             return None
     else:
         logger.warning(
-            f"In range {range_name}, header at position {idx+1} is not missing!"
+            f"In range {range_name}, header at position {idx + 1} is not missing!"
        )
         return None
     headers.append(header_val)