omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import text
|
|
4
|
+
from sqlalchemy import types as sa
|
|
5
|
+
from sqlalchemy.dialects import mysql
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def type_adapter_callback(sql_type):
    """Swap column types that need special handling for a substitute type.

    Args:
        sql_type: The SQLAlchemy type object reflected for a column.

    Returns:
        The ``sa.JSON`` type class when ``sql_type`` is a MySQL ``SET``
        instance; otherwise ``sql_type`` itself, unchanged.
    """
    return sa.JSON if isinstance(sql_type, mysql.SET) else sql_type
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def chained_query_adapter_callback(query_adapters):
    """
    Chain multiple query adapters together into a single adapter callback.

    This gives us the flexibility to introduce various adapters based on the
    given command parameters.

    Args:
        query_adapters: Iterable of callables, each taking ``(query, table)``
            and returning the (possibly modified) query.

    Returns:
        A ``callback(query, table)`` that feeds the query through every
        adapter in order and returns the final query.
    """
    # Snapshot the adapters up front: a one-shot iterable (e.g. a generator)
    # would otherwise be exhausted by the first call, and every later call
    # would silently apply no adapters at all.
    adapters = tuple(query_adapters)

    def callback(query, table):
        for adapter in adapters:
            query = adapter(query, table)

        return query

    return callback
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def limit_callback(sql_limit: int, incremental_key: str):
    """Build a query adapter that caps the row count and optionally orders rows.

    Args:
        sql_limit: Value passed to the query's ``limit`` method.
        incremental_key: Value passed to ``order_by``; ordering is skipped
            when this is falsy.

    Returns:
        A ``callback(query, table)`` returning the adjusted query. ``table``
        is accepted for adapter-signature compatibility and is not used.
    """

    def callback(query, table):
        limited = query.limit(sql_limit)
        return limited.order_by(incremental_key) if incremental_key else limited

    return callback
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def custom_query_variable_subsitution(query_value: str, kwargs: dict):
    """Build a query adapter that runs a custom SQL text with interval bounds bound.

    Args:
        query_value: Raw SQL text. It may reference the ``:interval_start``
            and/or ``:interval_end`` bind parameters.
        kwargs: Options consulted when no incremental object is supplied;
            the ``interval_start`` and ``interval_end`` keys are read.

    Returns:
        A ``callback(query, table, incremental=None, engine=None)`` that
        ignores the incoming ``query``/``table``/``engine`` and returns
        ``text(query_value)`` with the interval parameters bound.
    """

    def callback(query, table, incremental=None, engine=None):
        # Pick the source of the interval bounds: the incremental cursor when
        # one is supplied, the static kwargs otherwise.
        if incremental:
            interval_start = incremental.last_value
            interval_end = incremental.end_value
        else:
            interval_start = kwargs.get("interval_start")
            interval_end = kwargs.get("interval_end")

        # Only bind placeholders the query actually references: bindparams()
        # rejects names that are not present in the text. A referenced
        # placeholder always receives a value, falling back to the widest
        # possible bound, so it is never left unbound at execution time.
        params = {}
        if ":interval_start" in query_value:
            params["interval_start"] = (
                datetime.min if interval_start is None else interval_start
            )
        if ":interval_end" in query_value:
            params["interval_end"] = (
                datetime.max if interval_end is None else interval_end
            )

        return text(query_value).bindparams(**params)

    return callback
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""This source uses Stripe API and dlt to load data such as Customer, Subscription, Event etc. to the database and to calculate the MRR and churn rate."""
|
|
16
|
+
|
|
17
|
+
from typing import Any, Dict, Generator, Iterable, Optional, Tuple
|
|
18
|
+
|
|
19
|
+
import dlt
|
|
20
|
+
import stripe
|
|
21
|
+
from dlt.sources import DltResource
|
|
22
|
+
from pendulum import DateTime
|
|
23
|
+
|
|
24
|
+
from .helpers import (
|
|
25
|
+
async_parallel_pagination,
|
|
26
|
+
generate_date_ranges,
|
|
27
|
+
pagination,
|
|
28
|
+
transform_date,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dlt.source(max_table_nesting=0)
def stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
) -> Iterable[DltResource]:
    """
    Retrieves data from the Stripe API for the specified endpoints.

    For all endpoints, Stripe API responses do not provide the key "updated",
    so in most cases, we are forced to load the data in 'replace' mode.
    This source is suitable for all types of endpoints, including 'Events', 'Invoice', etc.
    but these endpoints can also be loaded in incremental mode (see source incremental_stripe_source).

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from. Required; this function defines no default.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to None.
        end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to None.

    Returns:
        Iterable[DltResource]: Resources with data that was created during the period greater than or equal to 'start_date' and less than 'end_date'.
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"

    # Generator backing every resource: yields whatever `pagination` yields
    # for the endpoint, using this source's start_date/end_date.
    def stripe_resource(
        endpoint: str,
    ) -> Generator[Dict[Any, Any], Any, None]:
        yield from pagination(endpoint, start_date, end_date)

    # One resource per endpoint, named after the endpoint and loaded in
    # 'replace' mode.
    for endpoint in endpoints:
        yield dlt.resource(
            stripe_resource,
            name=endpoint,
            write_disposition="replace",
        )(endpoint)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dlt.source(max_table_nesting=0)
def async_stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
    max_workers: int = 4,
    rate_limit_delay: float = 0.03,
) -> Iterable[DltResource]:
    """
    Async Stripe source that fetches each endpoint through `async_parallel_pagination`.

    WARNING: Returns data in RANDOM ORDER for maximum performance.

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        start_date (Optional[DateTime]): Accepted for signature parity with `stripe_source`. NOTE(review): this function does not pass it to `async_parallel_pagination`, so it has no effect here.
        end_date (Optional[DateTime]): Accepted for signature parity with `stripe_source`. NOTE(review): this function does not pass it to `async_parallel_pagination`, so it has no effect here.
        max_workers (int): Maximum number of concurrent async tasks. Defaults to 4.
        rate_limit_delay (float): Delay value handed to `async_parallel_pagination`. Defaults to 0.03 seconds.

    Returns:
        Iterable[DltResource]: Resources with data in RANDOM ORDER (optimized for speed).
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"

    # Yields the async generator returned by `async_parallel_pagination` as a
    # single item; only endpoint, max_workers and rate_limit_delay are passed.
    async def async_stripe_resource(endpoint: str):
        yield async_parallel_pagination(endpoint, max_workers, rate_limit_delay)

    # One resource per endpoint, named after the endpoint and loaded in
    # 'replace' mode.
    for endpoint in endpoints:
        yield dlt.resource(
            async_stripe_resource,
            name=endpoint,
            write_disposition="replace",
        )(endpoint)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@dlt.source(max_table_nesting=0)
def incremental_stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    initial_start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
) -> Iterable[DltResource]:
    """
    Loads Stripe endpoints incrementally, tracking the "created" field.

    For every endpoint a helper resource emits hourly date ranges, and a
    parallelized transformer fetches the endpoint's data for each range.
    The resulting resource is written with 'merge' disposition keyed on "id".

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        initial_start_date (Optional[DateTime]): Initial value for the "created" incremental cursor. When None, -1 is used as the initial value and the first range starts at 2010-01-01.
        end_date (Optional[DateTime]): Optional end value for the "created" incremental cursor. When None, ranges run up to the current time.

    Returns:
        Iterable[DltResource]: One transformer resource per endpoint, named after the endpoint.
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"
    # -1 is a sentinel for "no start date given"; it is replaced by the
    # 2010-01-01 default inside date_range_resource.
    start_date_unix = (
        transform_date(initial_start_date) if initial_start_date is not None else -1
    )

    for endpoint in endpoints:

        # `endpoint` is bound as a default argument so each resource keeps the
        # endpoint of its own loop iteration.
        def date_range_resource(
            endpoint: str = endpoint,
            created: Optional[Any] = dlt.sources.incremental(
                "created",
                initial_value=start_date_unix,
                end_value=transform_date(end_date) if end_date is not None else None,
                range_end="closed",
                range_start="closed",
            ),
        ) -> Generator[Dict[str, Any], None, None]:
            from dlt.common import pendulum

            # Use 2010-01-01 as default start (Stripe founding year) to avoid
            # generating hundreds of thousands of hourly ranges from 1969
            default_start_ts = int(pendulum.datetime(2010, 1, 1).timestamp())
            # Resume from the stored cursor when there is one, otherwise from
            # the configured start.
            start_ts = (
                created.last_value
                if created.last_value is not None
                else start_date_unix
            )
            if start_ts < 0:
                start_ts = default_start_ts
            end_ts = (
                created.end_value
                if created.end_value is not None
                else int(pendulum.now().timestamp())
            )
            for date_range in generate_date_ranges(start_ts, end_ts):
                date_range["endpoint"] = endpoint
                # "created" is set to the end of the range; presumably this is
                # what advances the incremental cursor — TODO confirm.
                date_range["created"] = date_range["end_ts"]
                yield date_range

        def fetch_date_range(
            date_range: Dict[str, int],
        ) -> Generator[Dict[Any, Any], Any, None]:
            """Transformer that fetches data for a given date range."""
            yield from pagination(
                date_range["endpoint"],
                start_date=date_range["start_ts"],
                end_date=date_range["end_ts"],
            )

        date_ranges = dlt.resource(
            date_range_resource,
            name=f"{endpoint}_date_ranges",
        )()

        # Pipe the date ranges into the fetching transformer; the transformer
        # carries the endpoint's name and the merge settings.
        yield (
            date_ranges
            | dlt.transformer(
                fetch_date_range,
                name=endpoint,
                write_disposition="merge",
                primary_key="id",
                parallelized=True,
            )
        )
|
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Stripe analytics source helpers"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import math
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import Any, Dict, Iterable, List, Optional, Union
|
|
21
|
+
|
|
22
|
+
import stripe
|
|
23
|
+
from dlt.common import pendulum
|
|
24
|
+
from dlt.common.typing import TDataItem
|
|
25
|
+
from pendulum import DateTime
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def pagination(
    endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
) -> Iterable[TDataItem]:
    """
    Walk through every page of an endpoint and yield each page's items.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        start_date (Optional[Any]): Optional lower bound passed through to `stripe_get_data`. Defaults to None.
        end_date (Optional[Any]): Optional upper bound passed through to `stripe_get_data`. Defaults to None.

    Returns:
        Iterable[TDataItem]: One item per page, each being that page's "data" list.
    """
    cursor = None
    more_pages = True
    while more_pages:
        page = stripe_get_data(
            endpoint,
            start_date=start_date,
            end_date=end_date,
            starting_after=cursor,
        )
        records = page["data"]

        # The id of the last record becomes the cursor for the next request.
        if records:
            cursor = records[-1]["id"]
        yield records

        more_pages = page["has_more"]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
|
|
60
|
+
"""
|
|
61
|
+
Divide a time range into equal chunks for parallel processing.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
start_ts (int): Start timestamp
|
|
65
|
+
end_ts (int): End timestamp
|
|
66
|
+
num_chunks (int): Number of chunks to create
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
|
|
70
|
+
"""
|
|
71
|
+
total_duration = end_ts - start_ts
|
|
72
|
+
chunk_duration = math.ceil(total_duration / num_chunks)
|
|
73
|
+
|
|
74
|
+
chunks = []
|
|
75
|
+
current_start = start_ts
|
|
76
|
+
|
|
77
|
+
for i in range(num_chunks):
|
|
78
|
+
current_end = min(current_start + chunk_duration, end_ts)
|
|
79
|
+
if current_start < end_ts:
|
|
80
|
+
chunks.append((current_start, current_end))
|
|
81
|
+
current_start = current_end
|
|
82
|
+
|
|
83
|
+
if current_start >= end_ts:
|
|
84
|
+
break
|
|
85
|
+
|
|
86
|
+
return chunks
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _create_adaptive_time_chunks(
    start_ts: int, end_ts: int, max_workers: int
) -> List[tuple]:
    """
    Split a time range into chunks whose size depends on the era they fall in.

    The part before 2015 is split into at most 3 chunks, the 2015-2020 part
    into roughly half-year chunks (at least 2, at most ``max_workers``), and
    everything from 2020 on into chunks of one day each.

    Args:
        start_ts (int): Start timestamp
        end_ts (int): End timestamp
        max_workers (int): Maximum number of workers; caps the chunk count of the two older eras

    Returns:
        List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
    """
    boundary_2020 = int(pendulum.datetime(2020, 1, 1).timestamp())
    boundary_2015 = int(pendulum.datetime(2015, 1, 1).timestamp())
    seconds_per_year = 365 * 24 * 60 * 60

    result = []
    cursor = start_ts

    # Era 1 (before 2015): a handful of large chunks.
    if cursor < boundary_2015:
        era_end = min(boundary_2015, end_ts)
        if cursor < era_end:
            result.extend(_create_time_chunks(cursor, era_end, min(3, max_workers)))
            cursor = era_end

    # Era 2 (2015-2020): about two chunks per year.
    if cursor < boundary_2020 and cursor < end_ts:
        era_end = min(boundary_2020, end_ts)
        span_years = (era_end - cursor) / seconds_per_year
        era_chunk_count = min(max_workers, max(2, int(span_years * 2)))
        result.extend(_create_time_chunks(cursor, era_end, era_chunk_count))
        cursor = era_end

    # Era 3 (2020 onwards): one chunk per day, measured from the cursor in
    # local wall-clock time.
    while cursor < end_ts:
        one_day_later = datetime.fromtimestamp(cursor) + timedelta(days=1)
        day_end = min(int(one_day_later.timestamp()), end_ts)
        result.append((cursor, day_end))
        cursor = day_end

    return result
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _fetch_chunk_data_streaming(
    endpoint: str, start_ts: int, end_ts: int
) -> List[List[TDataItem]]:
    """
    Fetch data for a specific time chunk using sequential pagination.

    Despite the name, every batch of the chunk is accumulated in a list and
    returned at once; progress is reported on stdout after each batch.

    Args:
        endpoint (str): The Stripe endpoint to fetch from
        start_ts (int): Start timestamp for this chunk
        end_ts (int): End timestamp for this chunk

    Returns:
        List[List[TDataItem]]: List of batches of data items
    """
    # The chunk label is the same for every log line, so format it once.
    start_label = datetime.fromtimestamp(start_ts).strftime("%Y-%m-%d")
    end_label = datetime.fromtimestamp(end_ts).strftime("%Y-%m-%d")
    chunk_label = f"{start_label}-{end_label}"

    print(f"Fetching chunk {chunk_label}")
    chunk_data = []

    # Count from 1 so the progress line reports the number of batches that
    # have actually been processed (the first line reads "Processed 1 ...").
    for batch_count, batch in enumerate(
        pagination(endpoint, start_ts, end_ts), start=1
    ):
        chunk_data.append(batch)
        print(f"Processed {batch_count} batches for chunk {chunk_label}")

    return chunk_data
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
async def async_pagination(
    endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
) -> Iterable[TDataItem]:
    """
    Async counterpart of `pagination`: walk every page and yield its items.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        start_date (Optional[Any]): Optional lower bound passed through to `stripe_get_data_async`. Defaults to None.
        end_date (Optional[Any]): Optional upper bound passed through to `stripe_get_data_async`. Defaults to None.

    Returns:
        Iterable[TDataItem]: One item per page, each being that page's "data" list.
    """
    cursor = None
    more_pages = True
    while more_pages:
        page = await stripe_get_data_async(
            endpoint,
            start_date=start_date,
            end_date=end_date,
            starting_after=cursor,
        )
        records = page["data"]

        # The id of the last record becomes the cursor for the next request.
        if records:
            cursor = records[-1]["id"]
        yield records

        more_pages = page["has_more"]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
async def async_parallel_pagination(
    endpoint: str,
    max_workers: int = 8,
    rate_limit_delay: float = 5,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
) -> Iterable[TDataItem]:
    """
    Async parallel pagination - yields data in completion (i.e. arbitrary) order.

    The time range is split into chunks, the chunks are fetched concurrently,
    and each chunk's batches are yielded as soon as that chunk has finished.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        max_workers (int): Maximum number of chunks fetched concurrently. Defaults to 8.
        rate_limit_delay (float): Currently not used by this function; kept for interface compatibility. Defaults to 5.
        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to now if None.

    Returns:
        Iterable[TDataItem]: Data items retrieved from the endpoint (RANDOM ORDER FOR SPEED).

    Raises:
        Exception: Whatever a chunk fetch raised; it is logged and re-raised,
            and chunk fetches that are still pending are cancelled.
    """
    start_ts = transform_date(
        pendulum.datetime(2010, 1, 1) if start_date is None else start_date
    )
    end_ts = transform_date(pendulum.now() if end_date is None else end_date)

    # Create time chunks with larger chunks for 2010s (less data expected)
    time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)

    # The semaphore caps how many chunk fetches run at the same time.
    semaphore = asyncio.Semaphore(max_workers)

    async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
        async with semaphore:
            return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)

    # Schedule explicit tasks so they can be cancelled during cleanup.
    tasks = [
        asyncio.ensure_future(fetch_chunk_with_semaphore(chunk_start, chunk_end))
        for chunk_start, chunk_end in time_chunks
    ]

    try:
        for finished in asyncio.as_completed(tasks):
            try:
                chunk_data = await finished
            except Exception as exc:
                print(f"Async chunk processing generated an exception: {exc}")
                raise

            for batch in chunk_data:
                yield batch
    finally:
        # On failure or early close, do not leave the remaining fetches
        # running in the background.
        for task in tasks:
            if not task.done():
                task.cancel()
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
async def _fetch_chunk_data_async_fast(
    endpoint: str, start_ts: int, end_ts: int
) -> List[List[TDataItem]]:
    """
    Collect every batch `async_pagination` produces for one time chunk.

    Args:
        endpoint (str): The Stripe endpoint to fetch from
        start_ts (int): Start timestamp for this chunk
        end_ts (int): End timestamp for this chunk

    Returns:
        List[List[TDataItem]]: The batches, in the order they were produced
    """
    return [batch async for batch in async_pagination(endpoint, start_ts, end_ts)]
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def generate_date_ranges(start_ts: int, end_ts: int) -> Iterable[Dict[str, int]]:
    """Yield consecutive ranges that break on whole-hour boundaries.

    The first range starts at ``start_ts`` and ends at the next multiple of
    3600; later ranges span full hours, and the last one is clipped to
    ``end_ts``. Nothing is yielded when ``start_ts >= end_ts``.

    Args:
        start_ts (int): Start timestamp (unix)
        end_ts (int): End timestamp (unix)

    Yields:
        Dict[str, int]: Dictionary with 'start_ts' and 'end_ts' keys for each range
    """
    window_start = start_ts

    while window_start < end_ts:
        completed_hours, _ = divmod(window_start, 3600)
        hour_boundary = (completed_hours + 1) * 3600
        window_end = min(hour_boundary, end_ts)
        yield {"start_ts": window_start, "end_ts": window_end}
        window_start = window_end
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def transform_date(date: Union[str, datetime, int]) -> int:
    """Convert a date given as string, datetime or timestamp to a unix timestamp.

    Args:
        date: One of
            - a string in the form ``YYYY-MM-DDTHH:MM:SSZ`` (interpreted as UTC),
            - a ``datetime`` instance (this includes pendulum ``DateTime``,
              which subclasses it); naive datetimes are interpreted in the
              local timezone by ``datetime.timestamp()``,
            - anything else (e.g. an int timestamp), which is returned as is.

    Returns:
        int: Unix timestamp in whole seconds, or the input unchanged when it
        is neither a string nor a datetime.

    Raises:
        ValueError: If a string does not match the expected format.
    """
    # Local import: the file's import block only brings in datetime/timedelta.
    from datetime import timezone

    if isinstance(date, str):
        # The format uses strptime directives, so it has to be parsed with
        # strptime; the trailing "Z" is a literal and denotes UTC.
        date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").replace(
            tzinfo=timezone.utc
        )
    if isinstance(date, datetime):
        # convert to unix timestamp
        date = int(date.timestamp())
    return date
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def stripe_get_data(
    resource: str,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
    **kwargs: Any,
) -> Dict[Any, Any]:
    """Request one page (up to 100 items) of a Stripe resource.

    Args:
        resource: Name of the attribute on the `stripe` module to list,
            e.g. "Subscription".
        start_date: Optional lower bound for "created" (sent as "gte").
            Truthy values are converted with `transform_date`.
        end_date: Optional upper bound for "created" (sent as "lt").
            Truthy values are converted with `transform_date`.
        **kwargs: Extra arguments forwarded to the resource's ``list`` call.

    Returns:
        The list response converted to a plain dict.
    """
    created_from = transform_date(start_date) if start_date else start_date
    created_until = transform_date(end_date) if end_date else end_date

    # Subscriptions are always requested with status "all".
    if resource == "Subscription":
        kwargs["status"] = "all"

    api_resource = getattr(stripe, resource)
    listing = api_resource.list(
        created={"gte": created_from, "lt": created_until}, limit=100, **kwargs
    )
    return dict(listing)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
async def stripe_get_data_async(
    resource: str,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
    **kwargs: Any,
) -> Dict[Any, Any]:
    """Async version of stripe_get_data, retrying when Stripe rate-limits.

    Args:
        resource: Name of the attribute on the `stripe` module to list,
            e.g. "Subscription".
        start_date: Optional lower bound for "created" (sent as "gte").
            Truthy values are converted with `transform_date`.
        end_date: Optional upper bound for "created" (sent as "lt").
            Truthy values are converted with `transform_date`.
        **kwargs: Extra arguments forwarded to the resource's ``list_async`` call.

    Returns:
        The list response converted to a plain dict.

    Raises:
        stripe.RateLimitError: If the request is still rate limited after
            50 attempts.
    """
    from stripe import RateLimitError

    if start_date:
        start_date = transform_date(start_date)
    if end_date:
        end_date = transform_date(end_date)

    if resource == "Subscription":
        kwargs.update({"status": "all"})

    max_retries = 50
    max_wait_time_ms = 10000
    # The backoff below is computed in seconds, so the cap must be in seconds
    # too; comparing against the millisecond value made it ~2.8 hours.
    max_wait_time_s = max_wait_time_ms / 1000

    # retry_count is the number of rate-limited attempts so far.
    for retry_count in range(1, max_retries + 1):
        try:
            resource_dict = await getattr(stripe, resource).list_async(
                created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
            )
        except RateLimitError:
            if retry_count >= max_retries:
                # Re-raise the last exception if we've exhausted retries
                print(f"✗ Failed to fetch {resource} after {max_retries} retries")
                raise
            # Exponential backoff starting at 2 ms, capped at 10 seconds.
            wait_time = min(2**retry_count * 0.001, max_wait_time_s)
            print(
                f"Got rate limited, sleeping {wait_time} seconds before retrying...",
                flush=True,
            )
            await asyncio.sleep(wait_time)
        else:
            return dict(resource_dict)
|