omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
File without changes
@@ -0,0 +1,66 @@
1
+ from datetime import datetime
2
+
3
+ from sqlalchemy import text
4
+ from sqlalchemy import types as sa
5
+ from sqlalchemy.dialects import mysql
6
+
7
+
8
def type_adapter_callback(sql_type):
    """Swap MySQL ``SET`` column types for ``sa.JSON``.

    Any other type object is handed back unchanged.
    """
    return sa.JSON if isinstance(sql_type, mysql.SET) else sql_type
12
+
13
+
14
def chained_query_adapter_callback(query_adapters):
    """
    Compose several query adapters into a single ``(query, table)`` callback.

    The adapters run in the order given; each one receives the query returned
    by the previous adapter together with the same ``table``. This lets the
    caller assemble the adapter list from the command parameters.
    """

    def callback(query, table):
        adapted = query
        for apply_adapter in query_adapters:
            adapted = apply_adapter(adapted, table)
        return adapted

    return callback
27
+
28
+
29
def limit_callback(sql_limit: int, incremental_key: str):
    """Build a query adapter that caps the row count.

    The returned callback applies ``.limit(sql_limit)`` to the query and, when
    ``incremental_key`` is truthy, additionally applies
    ``.order_by(incremental_key)`` to the limited query.
    """

    def callback(query, table):
        limited = query.limit(sql_limit)
        if not incremental_key:
            return limited
        return limited.order_by(incremental_key)

    return callback
37
+
38
+
39
def custom_query_variable_subsitution(query_value: str, kwargs: dict):
    """Build a query adapter that replaces the query with a custom SQL text.

    The returned callback ignores the incoming ``query`` and ``table`` and
    returns ``text(query_value)`` with interval parameters bound:

    * With a truthy ``incremental``: ``interval_start`` is always bound to
      ``incremental.last_value`` (or ``datetime(1, 1, 1)`` when that is None),
      and ``interval_end`` is bound only when ``incremental.end_value`` is set.
    * Otherwise: each of ``:interval_start`` / ``:interval_end`` is bound only
      if the placeholder occurs in ``query_value``, taking the value from
      ``kwargs`` and falling back to ``datetime.min`` / ``datetime.max``.
    """

    def callback(query, table, incremental=None, engine=None):
        bound = {}
        if incremental:
            previous = incremental.last_value
            bound["interval_start"] = (
                datetime(year=1, month=1, day=1) if previous is None else previous
            )
            upper = incremental.end_value
            if upper is not None:
                bound["interval_end"] = upper
        else:
            # (bind name, placeholder as written in the SQL, fallback value)
            fallbacks = (
                ("interval_start", ":interval_start", datetime.min),
                ("interval_end", ":interval_end", datetime.max),
            )
            for name, placeholder, fallback in fallbacks:
                if placeholder in query_value:
                    supplied = kwargs.get(name)
                    bound[name] = fallback if supplied is None else supplied

        return text(query_value).bindparams(**bound)

    return callback
@@ -0,0 +1,183 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """This source uses Stripe API and dlt to load data such as Customer, Subscription, Event etc. to the database and to calculate the MRR and churn rate."""
16
+
17
+ from typing import Any, Dict, Generator, Iterable, Optional, Tuple
18
+
19
+ import dlt
20
+ import stripe
21
+ from dlt.sources import DltResource
22
+ from pendulum import DateTime
23
+
24
+ from .helpers import (
25
+ async_parallel_pagination,
26
+ generate_date_ranges,
27
+ pagination,
28
+ transform_date,
29
+ )
30
+
31
+
32
@dlt.source(max_table_nesting=0)
def stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
) -> Iterable[DltResource]:
    """
    Retrieves data from the Stripe API for the specified endpoints.

    For all endpoints, Stripe API responses do not provide the key "updated",
    so in most cases, we are forced to load the data in 'replace' mode.
    This source is suitable for all types of endpoints, including 'Events', 'Invoice', etc.
    but these endpoints can also be loaded in incremental mode (see source incremental_stripe_source).

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from. Required; there is no default.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to None.
        end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to None.

    Returns:
        Iterable[DltResource]: One resource per endpoint, named after the endpoint and written with
            the 'replace' disposition. Each yields the pages produced by `pagination` for the
            given `start_date` / `end_date`.
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"

    def stripe_resource(
        endpoint: str,
    ) -> Generator[Dict[Any, Any], Any, None]:
        # start_date / end_date are captured from the enclosing source call.
        yield from pagination(endpoint, start_date, end_date)

    for endpoint in endpoints:
        # The resource is instantiated immediately with the current endpoint,
        # so each yielded resource is bound to its own endpoint name.
        yield dlt.resource(
            stripe_resource,
            name=endpoint,
            write_disposition="replace",
        )(endpoint)
70
+
71
+
72
@dlt.source(max_table_nesting=0)
def async_stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
    max_workers: int = 4,
    rate_limit_delay: float = 0.03,
) -> Iterable[DltResource]:
    """
    Async Stripe source that fetches time chunks concurrently.

    WARNING: Returns data in RANDOM ORDER; batches are emitted in whatever
    order `async_parallel_pagination` produces them.

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        start_date (Optional[DateTime]): Accepted for signature parity with `stripe_source`.
            NOTE: not forwarded to `async_parallel_pagination` by this function.
        end_date (Optional[DateTime]): Accepted for signature parity with `stripe_source`.
            NOTE: not forwarded to `async_parallel_pagination` by this function.
        max_workers (int): Maximum number of concurrent async tasks. Defaults to 4.
        rate_limit_delay (float): Delay value passed through to `async_parallel_pagination`. Defaults to 0.03 seconds.

    Returns:
        Iterable[DltResource]: One resource per endpoint, named after the endpoint and written
            with the 'replace' disposition; data arrives in no particular order.
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"

    async def async_stripe_resource(endpoint: str):
        # Yields the async generator object itself (a single item) rather than
        # iterating it here; presumably dlt evaluates yielded async iterators
        # -- TODO confirm against the dlt version in use.
        yield async_parallel_pagination(endpoint, max_workers, rate_limit_delay)

    for endpoint in endpoints:
        yield dlt.resource(
            async_stripe_resource,
            name=endpoint,
            write_disposition="replace",
        )(endpoint)
110
+
111
+
112
@dlt.source(max_table_nesting=0)
def incremental_stripe_source(
    endpoints: Tuple[str, ...],
    stripe_secret_key: str = dlt.secrets.value,
    initial_start_date: Optional[DateTime] = None,
    end_date: Optional[DateTime] = None,
) -> Iterable[DltResource]:
    """
    Loads Stripe endpoints incrementally, tracking progress on the "created" field.

    For every endpoint two pieces are wired together:

    * a `<endpoint>_date_ranges` resource that emits hourly
      `{"start_ts", "end_ts", "endpoint", "created"}` dicts covering the span
      still to be loaded, and
    * a transformer named after the endpoint that runs `pagination` for each
      emitted range and merges the result on primary key "id".

    Args:
        endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
        stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
        initial_start_date (Optional[DateTime]): Start of the first load. When None, -1 is used as
            the incremental's initial value and the range generation starts at 2010-01-01.
        end_date (Optional[DateTime]): Optional end of the load window, passed as the
            incremental's `end_value`. When None, ranges run up to the current time.

    Returns:
        Iterable[DltResource]: One `date_ranges | transformer` pipe per endpoint.
    """
    # Credentials and API version are set as attributes on the imported
    # `stripe` module itself, not on a per-source client object.
    stripe.api_key = stripe_secret_key
    stripe.api_version = "2022-11-15"
    # -1 is a sentinel meaning "no explicit start"; it is replaced by the
    # 2010-01-01 default inside date_range_resource.
    start_date_unix = (
        transform_date(initial_start_date) if initial_start_date is not None else -1
    )

    for endpoint in endpoints:

        def date_range_resource(
            # Default argument pins the current loop value of `endpoint`,
            # avoiding late binding of the loop variable in the closure.
            endpoint: str = endpoint,
            created: Optional[Any] = dlt.sources.incremental(
                "created",
                initial_value=start_date_unix,
                end_value=transform_date(end_date) if end_date is not None else None,
                range_end="closed",
                range_start="closed",
            ),
        ) -> Generator[Dict[str, Any], None, None]:
            from dlt.common import pendulum

            # Use 2010-01-01 as default start (Stripe founding year) to avoid
            # generating hundreds of thousands of hourly ranges from 1969
            default_start_ts = int(pendulum.datetime(2010, 1, 1).timestamp())
            # Resume from the stored cursor when there is one, otherwise from
            # the configured start (or the -1 sentinel).
            start_ts = (
                created.last_value
                if created.last_value is not None
                else start_date_unix
            )
            if start_ts < 0:
                start_ts = default_start_ts
            end_ts = (
                created.end_value
                if created.end_value is not None
                else int(pendulum.now().timestamp())
            )
            for date_range in generate_date_ranges(start_ts, end_ts):
                date_range["endpoint"] = endpoint
                # "created" is the field the incremental reads; it is set to
                # the end of the range being emitted.
                date_range["created"] = date_range["end_ts"]
                yield date_range

        def fetch_date_range(
            date_range: Dict[str, int],
        ) -> Generator[Dict[Any, Any], Any, None]:
            """Transformer that fetches data for a given date range."""
            yield from pagination(
                date_range["endpoint"],
                start_date=date_range["start_ts"],
                end_date=date_range["end_ts"],
            )

        date_ranges = dlt.resource(
            date_range_resource,
            name=f"{endpoint}_date_ranges",
        )()

        # Pipe the range resource into the fetching transformer; the
        # transformer carries the endpoint's table name and merge settings.
        yield (
            date_ranges
            | dlt.transformer(
                fetch_date_range,
                name=endpoint,
                write_disposition="merge",
                primary_key="id",
                parallelized=True,
            )
        )
@@ -0,0 +1,386 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Stripe analytics source helpers"""
16
+
17
+ import asyncio
18
+ import math
19
+ from datetime import datetime, timedelta
20
+ from typing import Any, Dict, Iterable, List, Optional, Union
21
+
22
+ import stripe
23
+ from dlt.common import pendulum
24
+ from dlt.common.typing import TDataItem
25
+ from pendulum import DateTime
26
+
27
+
28
def pagination(
    endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
) -> Iterable[TDataItem]:
    """
    Walk every page of a Stripe list endpoint, yielding one page at a time.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        start_date (Optional[Any]): Optional lower bound forwarded to `stripe_get_data`. Defaults to None.
        end_date (Optional[Any]): Optional upper bound forwarded to `stripe_get_data`. Defaults to None.

    Returns:
        Iterable[TDataItem]: The "data" list of each response, in request order.
    """
    cursor_id = None
    more_pages = True
    while more_pages:
        page = stripe_get_data(
            endpoint,
            start_date=start_date,
            end_date=end_date,
            starting_after=cursor_id,
        )
        records = page["data"]

        # The id of the last record becomes the cursor for the next request.
        if records:
            cursor_id = records[-1]["id"]
        yield records

        more_pages = page["has_more"]
57
+
58
+
59
+ def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
60
+ """
61
+ Divide a time range into equal chunks for parallel processing.
62
+
63
+ Args:
64
+ start_ts (int): Start timestamp
65
+ end_ts (int): End timestamp
66
+ num_chunks (int): Number of chunks to create
67
+
68
+ Returns:
69
+ List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
70
+ """
71
+ total_duration = end_ts - start_ts
72
+ chunk_duration = math.ceil(total_duration / num_chunks)
73
+
74
+ chunks = []
75
+ current_start = start_ts
76
+
77
+ for i in range(num_chunks):
78
+ current_end = min(current_start + chunk_duration, end_ts)
79
+ if current_start < end_ts:
80
+ chunks.append((current_start, current_end))
81
+ current_start = current_end
82
+
83
+ if current_start >= end_ts:
84
+ break
85
+
86
+ return chunks
87
+
88
+
89
def _create_adaptive_time_chunks(
    start_ts: int, end_ts: int, max_workers: int
) -> List[tuple]:
    """
    Split a time range into windows whose size depends on the era they cover.

    * before 2015-01-01: at most ``min(3, max_workers)`` equal windows
    * 2015-01-01 to 2020-01-01: roughly half-year windows, at least 2 and at
      most ``max_workers``
    * from 2020-01-01 on: consecutive one-day windows

    Args:
        start_ts (int): Start timestamp
        end_ts (int): End timestamp
        max_workers (int): Maximum number of workers

    Returns:
        List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
    """
    seconds_per_year = 365 * 24 * 60 * 60
    boundary_2020 = int(pendulum.datetime(2020, 1, 1).timestamp())
    boundary_2015 = int(pendulum.datetime(2015, 1, 1).timestamp())

    windows = []
    cursor = start_ts

    # Era 1 (up to 2015): a handful of large windows.
    if cursor < boundary_2015:
        era_end = min(boundary_2015, end_ts)
        if cursor < era_end:
            windows.extend(
                _create_time_chunks(cursor, era_end, min(3, max_workers))
            )
            cursor = era_end

    # Era 2 (2015-2020): about two windows per year.
    if cursor < boundary_2020 and cursor < end_ts:
        era_end = min(boundary_2020, end_ts)
        if cursor < era_end:
            era_years = (era_end - cursor) / seconds_per_year
            era_window_count = min(max_workers, max(2, int(era_years * 2)))
            windows.extend(_create_time_chunks(cursor, era_end, era_window_count))
            cursor = era_end

    # Era 3 (2020 onwards): step forward one day at a time from the cursor.
    while cursor < end_ts:
        one_day_later = datetime.fromtimestamp(cursor) + timedelta(days=1)
        upper = min(int(one_day_later.timestamp()), end_ts)
        windows.append((cursor, upper))
        cursor = upper

    return windows
152
+
153
+
154
def _fetch_chunk_data_streaming(
    endpoint: str, start_ts: int, end_ts: int
) -> List[List[TDataItem]]:
    """
    Fetch every page for one time chunk using sequential pagination.

    Despite the name, all pages of the chunk are collected into a list before
    being returned; progress is reported on stdout after each page.

    Args:
        endpoint (str): The Stripe endpoint to fetch from
        start_ts (int): Start timestamp for this chunk
        end_ts (int): End timestamp for this chunk

    Returns:
        List[List[TDataItem]]: List of batches of data items
    """
    # The label is loop-invariant, so format it once (local-time dates).
    chunk_label = (
        f"{datetime.fromtimestamp(start_ts).strftime('%Y-%m-%d')}"
        f"-{datetime.fromtimestamp(end_ts).strftime('%Y-%m-%d')}"
    )
    print(f"Fetching chunk {chunk_label}")

    chunk_data = []
    # Count from 1 so the message reports the number of batches actually
    # processed (previously the first batch was reported as 0).
    for batch_count, batch in enumerate(
        pagination(endpoint, start_ts, end_ts), start=1
    ):
        chunk_data.append(batch)
        print(f"Processed {batch_count} batches for chunk {chunk_label}")

    return chunk_data
184
+
185
+
186
async def async_pagination(
    endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
) -> Iterable[TDataItem]:
    """
    Async counterpart of `pagination`: walk every page of a list endpoint.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        start_date (Optional[Any]): Optional lower bound forwarded to `stripe_get_data_async`. Defaults to None.
        end_date (Optional[Any]): Optional upper bound forwarded to `stripe_get_data_async`. Defaults to None.

    Returns:
        Iterable[TDataItem]: The "data" list of each response, in request order
            (this is an async generator; consume it with ``async for``).
    """
    cursor_id = None
    more_pages = True
    while more_pages:
        page = await stripe_get_data_async(
            endpoint,
            start_date=start_date,
            end_date=end_date,
            starting_after=cursor_id,
        )
        records = page["data"]

        # The id of the last record becomes the cursor for the next request.
        if records:
            cursor_id = records[-1]["id"]
        yield records

        more_pages = page["has_more"]
215
+
216
+
217
async def async_parallel_pagination(
    endpoint: str,
    max_workers: int = 8,
    rate_limit_delay: float = 5,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
) -> Iterable[TDataItem]:
    """
    Fetch an endpoint in parallel time chunks, yielding batches as chunks finish.

    The requested range is split with `_create_adaptive_time_chunks`; each
    chunk is paginated in its own task, with at most `max_workers` chunks in
    flight. Batches are yielded in chunk-completion order, NOT chronological
    order.

    Args:
        endpoint (str): The endpoint to retrieve data from.
        max_workers (int): Maximum number of concurrent async tasks. Defaults to 8.
        rate_limit_delay (float): Accepted for interface compatibility; currently
            not used by this function. Defaults to 5.
        start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to 2010-01-01 if None.
        end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to now if None.

    Returns:
        Iterable[TDataItem]: Data items retrieved from the endpoint, in no
            particular order (async generator; consume with ``async for``).

    Raises:
        Exception: Whatever a chunk fetch raised; it is reported on stdout and
            re-raised, and chunk tasks still running are cancelled.
    """
    start_ts = transform_date(
        pendulum.datetime(2010, 1, 1) if start_date is None else start_date
    )
    end_ts = transform_date(pendulum.now() if end_date is None else end_date)

    # Larger chunks for the early years, daily chunks for recent data.
    time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)

    # The semaphore bounds how many chunks are being paginated at once.
    semaphore = asyncio.Semaphore(max_workers)

    async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
        async with semaphore:
            return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)

    # Hold explicit task handles so unfinished work can be cancelled below.
    tasks = [
        asyncio.ensure_future(fetch_chunk_with_semaphore(chunk_start, chunk_end))
        for chunk_start, chunk_end in time_chunks
    ]

    try:
        for finished in asyncio.as_completed(tasks):
            for batch in await finished:
                yield batch
    except Exception as exc:
        print(f"Async chunk processing generated an exception: {exc}")
        raise
    finally:
        # On failure or early close, stop chunks that are still fetching so
        # they do not keep calling the API in the background.
        for task in tasks:
            if not task.done():
                task.cancel()
268
+
269
+
270
async def _fetch_chunk_data_async_fast(
    endpoint: str, start_ts: int, end_ts: int
) -> List[List[TDataItem]]:
    """
    Collect every page that `async_pagination` yields for one time chunk.

    Args:
        endpoint (str): The Stripe endpoint to fetch from
        start_ts (int): Start timestamp for this chunk
        end_ts (int): End timestamp for this chunk

    Returns:
        List[List[TDataItem]]: The pages of the chunk, in the order received
    """
    return [page async for page in async_pagination(endpoint, start_ts, end_ts)]
289
+
290
+
291
def generate_date_ranges(start_ts: int, end_ts: int) -> Iterable[Dict[str, int]]:
    """Yield consecutive windows that break on whole-hour boundaries.

    The first window runs from ``start_ts`` to the next multiple of 3600, the
    following ones are full hours, and the last is clipped to ``end_ts``.
    Nothing is yielded when ``start_ts >= end_ts``.

    Args:
        start_ts (int): Start timestamp (unix)
        end_ts (int): End timestamp (unix)

    Yields:
        Dict[str, int]: Dictionary with 'start_ts' and 'end_ts' keys for each window
    """
    window_start = start_ts

    while window_start < end_ts:
        # First hour boundary strictly after the window start.
        hour_boundary = 3600 * (window_start // 3600 + 1)
        window_end = end_ts if end_ts < hour_boundary else hour_boundary
        yield {"start_ts": window_start, "end_ts": window_end}
        window_start = window_end
308
+
309
+
310
def transform_date(date: Union[str, DateTime, int]) -> int:
    """Normalize a date value to a unix timestamp in whole seconds.

    Args:
        date: One of
            * a string in the form ``YYYY-MM-DDTHH:MM:SSZ`` (interpreted as UTC),
            * a pendulum ``DateTime``,
            * an int, assumed to already be a unix timestamp.

    Returns:
        int: The unix timestamp. Values of any other type are returned as-is.

    Raises:
        ValueError: If a string does not match the expected format.
    """
    if isinstance(date, str):
        from datetime import timezone

        # The pattern is strptime syntax, so parse it with strptime; the
        # trailing "Z" is a literal, hence the explicit UTC tzinfo.
        parsed = datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ")
        return int(parsed.replace(tzinfo=timezone.utc).timestamp())
    if isinstance(date, DateTime):
        # convert to unix timestamp
        return int(date.timestamp())
    return date
317
+
318
+
319
def stripe_get_data(
    resource: str,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
    **kwargs: Any,
) -> Dict[Any, Any]:
    """Request one page (up to 100 items) of a Stripe resource listing.

    Args:
        resource (str): Name of the attribute on the `stripe` module to list.
        start_date (Optional[Any]): Lower bound for "created" (sent as "gte");
            truthy values are converted with `transform_date`.
        end_date (Optional[Any]): Upper bound for "created" (sent as "lt");
            truthy values are converted with `transform_date`.
        **kwargs: Extra arguments forwarded to ``.list`` (e.g. `starting_after`).

    Returns:
        Dict[Any, Any]: The list response converted to a plain dict.
    """
    created_from = transform_date(start_date) if start_date else start_date
    created_before = transform_date(end_date) if end_date else end_date

    # Subscriptions are always requested with status "all".
    if resource == "Subscription":
        kwargs["status"] = "all"

    lister = getattr(stripe, resource)
    listing = lister.list(
        created={"gte": created_from, "lt": created_before}, limit=100, **kwargs
    )
    return dict(listing)
337
+
338
+
339
async def stripe_get_data_async(
    resource: str,
    start_date: Optional[Any] = None,
    end_date: Optional[Any] = None,
    **kwargs: Any,
) -> Dict[Any, Any]:
    """Async version of stripe_get_data, with retry on rate limiting.

    Requests one page (up to 100 items) of a Stripe resource listing via
    ``list_async``. On `RateLimitError` the call is retried with exponential
    back-off: 2**attempt milliseconds, capped at 10 seconds per wait.

    Args:
        resource (str): Name of the attribute on the `stripe` module to list.
        start_date (Optional[Any]): Lower bound for "created" (sent as "gte");
            truthy values are converted with `transform_date`.
        end_date (Optional[Any]): Upper bound for "created" (sent as "lt");
            truthy values are converted with `transform_date`.
        **kwargs: Extra arguments forwarded to ``.list_async``.

    Returns:
        Dict[Any, Any]: The list response converted to a plain dict.

    Raises:
        stripe.RateLimitError: If the request is still rate limited after
            `max_retries` attempts.
    """
    if start_date:
        start_date = transform_date(start_date)
    if end_date:
        end_date = transform_date(end_date)

    if resource == "Subscription":
        kwargs.update({"status": "all"})

    from stripe import RateLimitError

    max_retries = 50
    max_wait_time_ms = 10000

    retry_count = 0
    while True:
        try:
            resource_dict = await getattr(stripe, resource).list_async(
                created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
            )
            return dict(resource_dict)
        except RateLimitError:
            retry_count += 1
            if retry_count >= max_retries:
                # Re-raise the last exception if we've exhausted retries
                print(f"✗ Failed to fetch {resource} after {max_retries} retries")
                raise

            # Apply the cap in milliseconds, then convert to seconds for
            # asyncio.sleep (previously the ms cap was compared to seconds).
            wait_time = min(2**retry_count, max_wait_time_ms) / 1000
            print(
                f"Got rate limited, sleeping {wait_time} seconds before retrying...",
                flush=True,
            )
            await asyncio.sleep(wait_time)