ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/stripe_analytics/__init__.py
CHANGED

@@ -7,13 +7,16 @@ import stripe
 from dlt.sources import DltResource
 from pendulum import DateTime
 
-from .helpers import
-
+from .helpers import (
+    async_parallel_pagination,
+    pagination,
+    transform_date,
+)
 
 
 @dlt.source(max_table_nesting=0)
 def stripe_source(
-    endpoints: Tuple[str, ...]
+    endpoints: Tuple[str, ...],
     stripe_secret_key: str = dlt.secrets.value,
     start_date: Optional[DateTime] = None,
     end_date: Optional[DateTime] = None,

@@ -51,32 +54,55 @@ def stripe_source(
         )(endpoint)
 
 
-@dlt.source
-def
-    endpoints: Tuple[str, ...]
+@dlt.source(max_table_nesting=0)
+def async_stripe_source(
+    endpoints: Tuple[str, ...],
     stripe_secret_key: str = dlt.secrets.value,
-
+    start_date: Optional[DateTime] = None,
     end_date: Optional[DateTime] = None,
+    max_workers: int = 4,
+    rate_limit_delay: float = 0.03,
 ) -> Iterable[DltResource]:
     """
-
-
-
-
+    ULTRA-FAST async Stripe source optimized for maximum speed and throughput.
+
+    WARNING: Returns data in RANDOM ORDER for maximum performance.
+    Uses aggressive concurrency and minimal delays to maximize API throughput.
 
     Args:
-        endpoints (
+        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
         stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
-
-
-
-
-
+        start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to 2010-01-01.
+        end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to today.
+        max_workers (int): Maximum number of concurrent async tasks. Defaults to 40 for maximum speed.
+        rate_limit_delay (float): Minimal delay between requests. Defaults to 0.03 seconds.
+
     Returns:
-        Iterable[DltResource]: Resources with
+        Iterable[DltResource]: Resources with data in RANDOM ORDER (optimized for speed).
     """
     stripe.api_key = stripe_secret_key
     stripe.api_version = "2022-11-15"
+
+    async def async_stripe_resource(endpoint: str):
+        yield async_parallel_pagination(endpoint, max_workers, rate_limit_delay)
+
+    for endpoint in endpoints:
+        yield dlt.resource(
+            async_stripe_resource,
+            name=endpoint,
+            write_disposition="replace",
+        )(endpoint)
+
+
+@dlt.source(max_table_nesting=0)
+def incremental_stripe_source(
+    endpoints: Tuple[str, ...],
+    stripe_secret_key: str = dlt.secrets.value,
+    initial_start_date: Optional[DateTime] = None,
+    end_date: Optional[DateTime] = None,
+) -> Iterable[DltResource]:
+    stripe.api_key = stripe_secret_key
+    stripe.api_version = "2022-11-15"
     start_date_unix = (
         transform_date(initial_start_date) if initial_start_date is not None else -1
     )

@@ -86,17 +112,19 @@ def incremental_stripe_source(
         created: Optional[Any] = dlt.sources.incremental(
             "created",
             initial_value=start_date_unix,
+            end_value=transform_date(end_date) if end_date is not None else None,
             range_end="closed",
             range_start="closed",
         ),
     ) -> Generator[Dict[Any, Any], Any, None]:
-
-
+        yield from pagination(
+            endpoint, start_date=created.last_value, end_date=created.end_value
+        )
 
     for endpoint in endpoints:
         yield dlt.resource(
             incremental_resource,
             name=endpoint,
-            write_disposition="
+            write_disposition="merge",
             primary_key="id",
         )(endpoint)
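For orientation, here is a minimal sketch of how the reworked sources could be driven from a dlt pipeline. The pipeline/dataset names, the "Invoice"/"Charge" endpoint strings, the secret key, and the dates are illustrative placeholders, not values taken from this diff.

import dlt
import pendulum

from ingestr.src.stripe_analytics import async_stripe_source, incremental_stripe_source

# Hypothetical pipeline; any dlt destination would do.
pipeline = dlt.pipeline(
    pipeline_name="stripe_demo",
    destination="duckdb",
    dataset_name="stripe_raw",
)

# Merge-based incremental load, now bounded on both ends via end_value.
info = pipeline.run(
    incremental_stripe_source(
        endpoints=("Invoice",),                   # assumed endpoint name
        stripe_secret_key="sk_test_placeholder",  # placeholder secret
        initial_start_date=pendulum.datetime(2024, 1, 1),
        end_date=pendulum.datetime(2024, 6, 1),
    )
)
print(info)

# Full refresh through the new async source; rows arrive in random order.
pipeline.run(
    async_stripe_source(
        endpoints=("Charge",),
        stripe_secret_key="sk_test_placeholder",
        max_workers=4,
    )
)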
ingestr/src/stripe_analytics/helpers.py
CHANGED

@@ -1,6 +1,9 @@
 """Stripe analytics source helpers"""
 
-
+import asyncio
+import math
+from datetime import datetime, timedelta
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 import stripe
 from dlt.common import pendulum

@@ -39,6 +42,238 @@ def pagination(
             break
 
 
+def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
+    """
+    Divide a time range into equal chunks for parallel processing.
+
+    Args:
+        start_ts (int): Start timestamp
+        end_ts (int): End timestamp
+        num_chunks (int): Number of chunks to create
+
+    Returns:
+        List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
+    """
+    total_duration = end_ts - start_ts
+    chunk_duration = math.ceil(total_duration / num_chunks)
+
+    chunks = []
+    current_start = start_ts
+
+    for i in range(num_chunks):
+        current_end = min(current_start + chunk_duration, end_ts)
+        if current_start < end_ts:
+            chunks.append((current_start, current_end))
+            current_start = current_end
+
+        if current_start >= end_ts:
+            break
+
+    return chunks
+
+
+def _create_adaptive_time_chunks(
+    start_ts: int, end_ts: int, max_workers: int
+) -> List[tuple]:
+    """
+    Create time chunks with adaptive sizing - larger chunks for 2010s (less data expected).
+
+    Args:
+        start_ts (int): Start timestamp
+        end_ts (int): End timestamp
+        max_workers (int): Maximum number of workers
+
+    Returns:
+        List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
+    """
+    chunks = []
+
+    # Key timestamps
+    year_2020_ts = int(pendulum.datetime(2020, 1, 1).timestamp())
+    year_2015_ts = int(pendulum.datetime(2015, 1, 1).timestamp())
+
+    current_start = start_ts
+
+    # Handle 2010-2015: Large chunks (2-3 year periods)
+    if current_start < year_2015_ts:
+        chunk_end = min(year_2015_ts, end_ts)
+        if current_start < chunk_end:
+            # Split 2010-2015 into 2-3 chunks max
+            pre_2015_chunks = _create_time_chunks(
+                current_start, chunk_end, min(3, max_workers)
+            )
+            chunks.extend(pre_2015_chunks)
+            current_start = chunk_end
+
+    # Handle 2015-2020: Medium chunks (6 month to 1 year periods)
+    if current_start < year_2020_ts and current_start < end_ts:
+        chunk_end = min(year_2020_ts, end_ts)
+        if current_start < chunk_end:
+            # Split 2015-2020 into smaller chunks
+            duration_2015_2020 = chunk_end - current_start
+            years_2015_2020 = duration_2015_2020 / (365 * 24 * 60 * 60)
+            num_chunks_2015_2020 = min(
+                max_workers, max(2, int(years_2015_2020 * 2))
+            )  # ~6 months per chunk
+
+            pre_2020_chunks = _create_time_chunks(
+                current_start, chunk_end, num_chunks_2015_2020
+            )
+            chunks.extend(pre_2020_chunks)
+            current_start = chunk_end
+
+    if current_start < end_ts:
+        # Split post-2020 data into daily chunks for maximum granularity
+        current_chunk_start = current_start
+        while current_chunk_start < end_ts:
+            # Calculate end of current day
+            current_date = datetime.fromtimestamp(current_chunk_start)
+            next_day = current_date + timedelta(days=1)
+            chunk_end = min(int(next_day.timestamp()), end_ts)
+
+            chunks.append((current_chunk_start, chunk_end))
+            current_chunk_start = chunk_end
+
+    return chunks
+
+
+def _fetch_chunk_data_streaming(
+    endpoint: str, start_ts: int, end_ts: int
+) -> List[List[TDataItem]]:
+    """
+    Fetch data for a specific time chunk using sequential pagination with memory-efficient approach.
+
+    Args:
+        endpoint (str): The Stripe endpoint to fetch from
+        start_ts (int): Start timestamp for this chunk
+        end_ts (int): End timestamp for this chunk
+
+    Returns:
+        List[List[TDataItem]]: List of batches of data items
+    """
+    # For streaming, we still need to collect the chunk data to maintain order
+    # but we can optimize by not holding all data in memory at once
+    print(
+        f"Fetching chunk {datetime.fromtimestamp(start_ts).strftime('%Y-%m-%d')}-{datetime.fromtimestamp(end_ts).strftime('%Y-%m-%d')}"
+    )
+    chunk_data = []
+    batch_count = 0
+
+    for batch in pagination(endpoint, start_ts, end_ts):
+        chunk_data.append(batch)
+        print(
+            f"Processed {batch_count} batches for chunk {datetime.fromtimestamp(start_ts).strftime('%Y-%m-%d')}-{datetime.fromtimestamp(end_ts).strftime('%Y-%m-%d')}"
+        )
+        batch_count += 1
+
+    return chunk_data
+
+
+async def async_pagination(
+    endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
+) -> Iterable[TDataItem]:
+    """
+    Async version of pagination that retrieves data from an endpoint with pagination.
+
+    Args:
+        endpoint (str): The endpoint to retrieve data from.
+        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to None.
+        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to None.
+
+    Returns:
+        Iterable[TDataItem]: Data items retrieved from the endpoint.
+    """
+    starting_after = None
+    while True:
+        response = await stripe_get_data_async(
+            endpoint,
+            start_date=start_date,
+            end_date=end_date,
+            starting_after=starting_after,
+        )
+
+        if len(response["data"]) > 0:
+            starting_after = response["data"][-1]["id"]
+        yield response["data"]
+
+        if not response["has_more"]:
+            break
+
+
+async def async_parallel_pagination(
+    endpoint: str,
+    max_workers: int = 8,
+    rate_limit_delay: float = 5,
+) -> Iterable[TDataItem]:
+    """
+    ULTRA-FAST async parallel pagination - yields data in random order for maximum speed.
+    No ordering constraints - pure performance optimization.
+
+    Args:
+        endpoint (str): The endpoint to retrieve data from.
+        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
+        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to today if None.
+        max_workers (int): Maximum number of concurrent async tasks. Defaults to 8 for balanced speed/rate limit respect.
+        rate_limit_delay (float): Minimal delay between requests. Defaults to 5 seconds.
+
+    Returns:
+        Iterable[TDataItem]: Data items retrieved from the endpoint (RANDOM ORDER FOR SPEED).
+    """
+
+    start_date = pendulum.datetime(2010, 1, 1)
+    end_date = pendulum.now()
+    start_ts = transform_date(start_date)
+    end_ts = transform_date(end_date)
+
+    # Create time chunks with larger chunks for 2010s (less data expected)
+    time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)
+
+    # Use asyncio semaphore to control concurrency and respect rate limits
+    semaphore = asyncio.Semaphore(max_workers)
+
+    async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
+        async with semaphore:
+            return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)
+
+    # Create all tasks
+    tasks = [
+        fetch_chunk_with_semaphore(chunk_start, chunk_end)
+        for chunk_start, chunk_end in time_chunks
+    ]
+
+    for coro in asyncio.as_completed(tasks):
+        try:
+            chunk_data = await coro
+
+            for batch in chunk_data:
+                yield batch
+
+        except Exception as exc:
+            print(f"Async chunk processing generated an exception: {exc}")
+            raise exc
+
+
+async def _fetch_chunk_data_async_fast(
+    endpoint: str, start_ts: int, end_ts: int
+) -> List[List[TDataItem]]:
+    """
+    ULTRA-FAST async chunk fetcher - no metadata overhead, direct data return.
+
+    Args:
+        endpoint (str): The Stripe endpoint to fetch from
+        start_ts (int): Start timestamp for this chunk
+        end_ts (int): End timestamp for this chunk
+
+    Returns:
+        List[List[TDataItem]]: Raw batches with zero overhead
+    """
+    chunk_data = []
+    async for batch in async_pagination(endpoint, start_ts, end_ts):
+        chunk_data.append(batch)
+
+    return chunk_data
+
+
 def transform_date(date: Union[str, DateTime, int]) -> int:
     if isinstance(date, str):
         date = pendulum.from_format(date, "%Y-%m-%dT%H:%M:%SZ")

@@ -66,3 +301,53 @@ def stripe_get_data(
         created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
     )
     return dict(resource_dict)
+
+
+async def stripe_get_data_async(
+    resource: str,
+    start_date: Optional[Any] = None,
+    end_date: Optional[Any] = None,
+    **kwargs: Any,
+) -> Dict[Any, Any]:
+    """Async version of stripe_get_data"""
+    if start_date:
+        start_date = transform_date(start_date)
+    if end_date:
+        end_date = transform_date(end_date)
+
+    if resource == "Subscription":
+        kwargs.update({"status": "all"})
+
+    import asyncio
+
+    from stripe import RateLimitError
+
+    max_retries = 50
+    retry_count = 0
+    max_wait_time_ms = 10000
+
+    while retry_count < max_retries:
+        # print(
+        #     f"Fetching {resource} from {datetime.fromtimestamp(start_date).strftime('%Y-%m-%d %H:%M:%S') if start_date else 'None'} to {datetime.fromtimestamp(end_date).strftime('%Y-%m-%d %H:%M:%S') if end_date else 'None'}, retry {retry_count} of {max_retries}",
+        #     flush=True,
+        # )
+        try:
+            resource_dict = await getattr(stripe, resource).list_async(
+                created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
+            )
+            return dict(resource_dict)
+        except RateLimitError:
+            retry_count += 1
+            if retry_count < max_retries:
+                wait_time = min(2**retry_count * 0.001, max_wait_time_ms)
+                print(
+                    f"Got rate limited, sleeping {wait_time} seconds before retrying...",
+                    flush=True,
+                )
+                await asyncio.sleep(wait_time)
+            else:
+                # Re-raise the last exception if we've exhausted retries
+                print(f"✗ Failed to fetch {resource} after {max_retries} retries")
+                raise
+
+    return dict(resource_dict)
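The heart of the new helpers is bounded concurrency over adaptive time chunks: a semaphore caps the number of in-flight chunk fetches while asyncio.as_completed streams back whichever chunk finishes first, which is exactly why the async source returns rows in random order. A self-contained sketch of that pattern with a stubbed fetcher (the toy chunk list and fake fetch are assumptions for illustration, not the package's code):

import asyncio
import random


async def fake_fetch_chunk(start_ts: int, end_ts: int) -> list:
    # Stand-in for _fetch_chunk_data_async_fast: pretend the API call takes a
    # variable amount of time and returns one "batch" for the chunk.
    await asyncio.sleep(random.random() * 0.2)
    return [{"chunk": (start_ts, end_ts)}]


async def parallel_chunks(chunks, max_workers: int = 4):
    semaphore = asyncio.Semaphore(max_workers)  # cap concurrent requests

    async def fetch_with_semaphore(start_ts, end_ts):
        async with semaphore:
            return await fake_fetch_chunk(start_ts, end_ts)

    tasks = [fetch_with_semaphore(s, e) for s, e in chunks]
    # as_completed yields results in completion order, not submission order.
    for coro in asyncio.as_completed(tasks):
        for batch in await coro:
            yield batch


async def main():
    chunks = [(i, i + 1) for i in range(10)]  # toy time chunks
    async for batch in parallel_chunks(chunks):
        print(batch)


asyncio.run(main())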
ingestr/src/stripe_analytics/settings.py
CHANGED

@@ -2,13 +2,65 @@
 
 # the most popular endpoints
 # Full list of the Stripe API endpoints you can find here: https://stripe.com/docs/api.
-ENDPOINTS =
-    "
-    "
-    "
-    "
-    "
-    "
-
-
-
+ENDPOINTS = {
+    "account": "Account",
+    "applepaydomain": "ApplePayDomain",
+    "apple_pay_domain": "ApplePayDomain",
+    "applicationfee": "ApplicationFee",
+    "application_fee": "ApplicationFee",
+    "checkoutsession": "CheckoutSession",
+    "checkout_session": "CheckoutSession",
+    "coupon": "Coupon",
+    "charge": "Charge",
+    "customer": "Customer",
+    "dispute": "Dispute",
+    "paymentintent": "PaymentIntent",
+    "payment_intent": "PaymentIntent",
+    "paymentlink": "PaymentLink",
+    "payment_link": "PaymentLink",
+    "paymentmethod": "PaymentMethod",
+    "payment_method": "PaymentMethod",
+    "paymentmethoddomain": "PaymentMethodDomain",
+    "payment_method_domain": "PaymentMethodDomain",
+    "payout": "Payout",
+    "plan": "Plan",
+    "price": "Price",
+    "product": "Product",
+    "promotioncode": "PromotionCode",
+    "promotion_code": "PromotionCode",
+    "quote": "Quote",
+    "refund": "Refund",
+    "review": "Review",
+    "setupattempt": "SetupAttempt",
+    "setup_attempt": "SetupAttempt",
+    "setupintent": "SetupIntent",
+    "setup_intent": "SetupIntent",
+    "shippingrate": "ShippingRate",
+    "shipping_rate": "ShippingRate",
+    "subscription": "Subscription",
+    "subscriptionitem": "SubscriptionItem",
+    "subscription_item": "SubscriptionItem",
+    "subscriptionschedule": "SubscriptionSchedule",
+    "subscription_schedule": "SubscriptionSchedule",
+    "transfer": "Transfer",
+    "taxcode": "TaxCode",
+    "tax_code": "TaxCode",
+    "taxid": "TaxId",
+    "tax_id": "TaxId",
+    "taxrate": "TaxRate",
+    "tax_rate": "TaxRate",
+    "topup": "Topup",
+    "top_up": "Topup",
+    "webhookendpoint": "WebhookEndpoint",
+    "webhook_endpoint": "WebhookEndpoint",
+    "invoice": "Invoice",
+    "invoiceitem": "InvoiceItem",
+    "invoice_item": "InvoiceItem",
+    "invoicelineitem": "InvoiceLineItem",
+    "invoice_line_item": "InvoiceLineItem",
+    "balancetransaction": "BalanceTransaction",
+    "balance_transaction": "BalanceTransaction",
+    "creditnote": "CreditNote",
+    "credit_note": "CreditNote",
+    "event": "Event",
+}
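The rewritten ENDPOINTS mapping accepts both collapsed and snake_case table names and resolves them to the Stripe class name that the helpers later look up with getattr(stripe, resource). A small sketch of that resolution step (the subset of the mapping and the resolve_endpoint helper are illustrative, not part of the package):

import stripe

# Illustrative subset of the ENDPOINTS mapping above.
ENDPOINTS = {
    "payment_intent": "PaymentIntent",
    "paymentintent": "PaymentIntent",
    "balance_transaction": "BalanceTransaction",
}


def resolve_endpoint(table_name: str) -> str:
    # Hypothetical helper: normalize user input before the lookup.
    key = table_name.strip().lower()
    if key not in ENDPOINTS:
        raise ValueError(f"Unsupported Stripe endpoint: {table_name}")
    return ENDPOINTS[key]


resource = resolve_endpoint("payment_intent")  # -> "PaymentIntent"
stripe_class = getattr(stripe, resource)       # the class the helpers page through
print(resource, stripe_class)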
ingestr/src/telemetry/event.py
CHANGED

@@ -1,13 +1,4 @@
 import os
-import platform
-
-import machineid
-import rudderstack.analytics as rudder_analytics  # type: ignore
-
-from ingestr.src.version import __version__  # type: ignore
-
-rudder_analytics.write_key = "2cUr13DDQcX2x2kAfMEfdrKvrQa"
-rudder_analytics.dataPlaneUrl = "https://getbruinbumlky.dataplane.rudderstack.com"
 
 
 def track(event_name, event_properties: dict):

@@ -16,6 +7,16 @@ def track(event_name, event_properties: dict):
     ):
         return
 
+    import platform
+
+    import machineid
+    import rudderstack.analytics as rudder_analytics  # type: ignore
+
+    from ingestr.src.version import __version__  # type: ignore
+
+    rudder_analytics.write_key = "2cUr13DDQcX2x2kAfMEfdrKvrQa"
+    rudder_analytics.dataPlaneUrl = "https://getbruinbumlky.dataplane.rudderstack.com"
+
     try:
         if not event_properties:
             event_properties = {}
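The telemetry change defers the platform/machineid/rudderstack imports and the write-key configuration until an event is actually about to be sent, so importing the module or running with tracking disabled skips that cost entirely. A generic sketch of the same lazy-import pattern (the flag name and the uuid stand-in are placeholders, not ingestr's actual implementation):

import os


def track(event_name: str, properties: dict | None = None) -> None:
    # Bail out before importing anything heavy when telemetry is disabled.
    if os.environ.get("TELEMETRY_DISABLED") == "1":  # placeholder flag name
        return

    # Heavy imports and client configuration happen only when an event fires.
    import uuid  # stand-in for machineid / rudderstack in the real module

    payload = {
        "event": event_name,
        "anonymous_id": str(uuid.uuid4()),
        **(properties or {}),
    }
    print("would send:", payload)  # the real code calls rudder_analytics.track(...)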
ingestr/src/tiktok_ads/__init__.py
CHANGED

@@ -112,7 +112,8 @@ def tiktok_source(
         datetime=(
             dlt.sources.incremental(
                 incremental_loading_param,
-                start_date,
+                initial_value=start_date,
+                end_value=end_date,
                 range_end="closed",
                 range_start="closed",
             )

@@ -120,15 +121,20 @@ def tiktok_source(
             else None
         ),
     ) -> Iterable[TDataItem]:
-
+        start_date_tz_adjusted = start_date.in_tz(timezone)
+        end_date_tz_adjusted = end_date.in_tz(timezone)
 
         if datetime is not None:
-
-
+            start_date_tz_adjusted = ensure_pendulum_datetime(
+                datetime.last_value
+            ).in_tz(timezone)
+            end_date_tz_adjusted = ensure_pendulum_datetime(datetime.end_value).in_tz(
+                timezone
+            )
 
         list_of_interval = find_intervals(
-            current_date=
-            end_date=
+            current_date=start_date_tz_adjusted,
+            end_date=end_date_tz_adjusted,
             interval_days=interval_days,
         )
 
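The TikTok resource now hands both initial_value and end_value to dlt.sources.incremental and reads last_value/end_value back inside the resource to bound each run. A minimal sketch of that pattern in isolation (the resource name, field, and dates are assumptions):

import dlt
import pendulum


@dlt.resource(name="events", write_disposition="merge", primary_key="id")
def events(
    updated=dlt.sources.incremental(
        "updated_at",
        initial_value=pendulum.datetime(2024, 1, 1).isoformat(),
        end_value=pendulum.datetime(2024, 2, 1).isoformat(),
        range_start="closed",
        range_end="closed",
    ),
):
    # Window boundaries come from the incremental object, not the function args.
    start, end = updated.last_value, updated.end_value
    # A real resource would query an API between start and end here.
    yield {"id": 1, "updated_at": start}

# Typically passed to pipeline.run(...); shown here only to illustrate the bounds.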
ingestr/src/trustpilot/__init__.py
ADDED

@@ -0,0 +1,48 @@
+"""Trustpilot source for ingesting reviews."""
+
+from typing import Any, Dict, Generator, Iterable
+
+import dlt
+import pendulum
+from dlt.sources import DltResource
+
+from .client import TrustpilotClient
+
+
+@dlt.source()
+def trustpilot_source(
+    business_unit_id: str,
+    start_date: str,
+    end_date: str | None,
+    api_key: str,
+    per_page: int = 1000,
+) -> Iterable[DltResource]:
+    """Return resources for Trustpilot."""
+
+    client = TrustpilotClient(api_key=api_key)
+
+    @dlt.resource(name="reviews", write_disposition="merge", primary_key="id")
+    def reviews(
+        dateTime=(
+            dlt.sources.incremental(
+                "updated_at",
+                initial_value=start_date,
+                end_value=end_date,
+                range_start="closed",
+                range_end="closed",
+            )
+        ),
+    ) -> Generator[Dict[str, Any], None, None]:
+        if end_date is None:
+            end_dt = pendulum.now(tz="UTC").isoformat()
+        else:
+            end_dt = dateTime.end_value
+        start_dt = dateTime.last_value
+        yield from client.paginated_reviews(
+            business_unit_id=business_unit_id,
+            per_page=per_page,
+            updated_since=start_dt,
+            end_date=end_dt,
+        )
+
+    yield reviews
ingestr/src/trustpilot/client.py
ADDED

@@ -0,0 +1,48 @@
+"""Simple Trustpilot API client."""
+
+from typing import Any, Dict, Iterable
+
+import pendulum
+from dlt.sources.helpers import requests
+
+
+class TrustpilotClient:
+    """Client for the Trustpilot public API."""
+
+    def __init__(self, api_key: str) -> None:
+        self.api_key = api_key
+        self.base_url = "https://api.trustpilot.com/v1"
+
+    def _get(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        params = dict(params)
+        params["apikey"] = self.api_key
+        response = requests.get(f"{self.base_url}{endpoint}", params=params)
+        response.raise_for_status()
+        return response.json()
+
+    def paginated_reviews(
+        self,
+        business_unit_id: str,
+        updated_since: str,
+        end_date: str,
+        per_page: int = 1000,
+    ) -> Iterable[Dict[str, Any]]:
+        page = 1
+        while True:
+            params: Dict[str, Any] = {"perPage": per_page, "page": page}
+            if updated_since:
+                params["updatedSince"] = updated_since
+            data = self._get(f"/business-units/{business_unit_id}/reviews", params)
+            reviews = data.get("reviews", data)
+            if not reviews:
+                break
+            for review in reviews:
+                end_date_dt = pendulum.parse(end_date)
+                review["updated_at"] = review["updatedAt"]
+                review_dt = pendulum.parse(review["updated_at"])
+                if review_dt > end_date_dt:  # type: ignore
+                    continue
+                yield review
+            if len(reviews) < per_page:
+                break
+            page += 1