ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/stripe_analytics/__init__.py CHANGED
@@ -7,13 +7,16 @@ import stripe
  from dlt.sources import DltResource
  from pendulum import DateTime

- from .helpers import pagination, transform_date
- from .settings import ENDPOINTS, INCREMENTAL_ENDPOINTS
+ from .helpers import (
+     async_parallel_pagination,
+     pagination,
+     transform_date,
+ )


  @dlt.source(max_table_nesting=0)
  def stripe_source(
-     endpoints: Tuple[str, ...] = ENDPOINTS,
+     endpoints: Tuple[str, ...],
      stripe_secret_key: str = dlt.secrets.value,
      start_date: Optional[DateTime] = None,
      end_date: Optional[DateTime] = None,
@@ -51,32 +54,55 @@ def stripe_source(
      )(endpoint)


- @dlt.source
- def incremental_stripe_source(
-     endpoints: Tuple[str, ...] = INCREMENTAL_ENDPOINTS,
+ @dlt.source(max_table_nesting=0)
+ def async_stripe_source(
+     endpoints: Tuple[str, ...],
      stripe_secret_key: str = dlt.secrets.value,
-     initial_start_date: Optional[DateTime] = None,
+     start_date: Optional[DateTime] = None,
      end_date: Optional[DateTime] = None,
+     max_workers: int = 4,
+     rate_limit_delay: float = 0.03,
  ) -> Iterable[DltResource]:
      """
-     As Stripe API does not include the "updated" key in its responses,
-     we are only able to perform incremental downloads from endpoints where all objects are uneditable.
-     This source yields the resources with incremental loading based on "append" mode.
-     You will load only the newest data without duplicating and without downloading a huge amount of data each time.
+     ULTRA-FAST async Stripe source optimized for maximum speed and throughput.
+
+     WARNING: Returns data in RANDOM ORDER for maximum performance.
+     Uses aggressive concurrency and minimal delays to maximize API throughput.

      Args:
-         endpoints (tuple): A tuple of endpoint names to retrieve data from. Defaults to Stripe API endpoints with uneditable data.
+         endpoints (Tuple[str, ...]): A tuple of endpoint names to retrieve data from.
          stripe_secret_key (str): The API access token for authentication. Defaults to the value in the `dlt.secrets` object.
-         initial_start_date (Optional[DateTime]): An optional parameter that specifies the initial value for dlt.sources.incremental.
-             If parameter is not None, then load only data that were created after initial_start_date on the first run.
-             Defaults to None. Format: datetime(YYYY, MM, DD).
-         end_date (Optional[DateTime]): An optional end date to limit the data retrieved.
-             Defaults to None. Format: datetime(YYYY, MM, DD).
+         start_date (Optional[DateTime]): An optional start date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to 2010-01-01.
+         end_date (Optional[DateTime]): An optional end date to limit the data retrieved. Format: datetime(YYYY, MM, DD). Defaults to today.
+         max_workers (int): Maximum number of concurrent async tasks. Defaults to 4.
+         rate_limit_delay (float): Minimal delay between requests. Defaults to 0.03 seconds.
+
      Returns:
-         Iterable[DltResource]: Resources with only that data has not yet been loaded.
+         Iterable[DltResource]: Resources with data in RANDOM ORDER (optimized for speed).
      """
      stripe.api_key = stripe_secret_key
      stripe.api_version = "2022-11-15"
+
+     async def async_stripe_resource(endpoint: str):
+         yield async_parallel_pagination(endpoint, max_workers, rate_limit_delay)
+
+     for endpoint in endpoints:
+         yield dlt.resource(
+             async_stripe_resource,
+             name=endpoint,
+             write_disposition="replace",
+         )(endpoint)
+
+
+ @dlt.source(max_table_nesting=0)
+ def incremental_stripe_source(
+     endpoints: Tuple[str, ...],
+     stripe_secret_key: str = dlt.secrets.value,
+     initial_start_date: Optional[DateTime] = None,
+     end_date: Optional[DateTime] = None,
+ ) -> Iterable[DltResource]:
+     stripe.api_key = stripe_secret_key
+     stripe.api_version = "2022-11-15"
      start_date_unix = (
          transform_date(initial_start_date) if initial_start_date is not None else -1
      )
@@ -86,17 +112,19 @@ def incremental_stripe_source(
          created: Optional[Any] = dlt.sources.incremental(
              "created",
              initial_value=start_date_unix,
+             end_value=transform_date(end_date) if end_date is not None else None,
              range_end="closed",
              range_start="closed",
          ),
      ) -> Generator[Dict[Any, Any], Any, None]:
-         start_value = created.last_value
-         yield from pagination(endpoint, start_date=start_value, end_date=end_date)
+         yield from pagination(
+             endpoint, start_date=created.last_value, end_date=created.end_value
+         )

      for endpoint in endpoints:
          yield dlt.resource(
              incremental_resource,
              name=endpoint,
-             write_disposition="append",
+             write_disposition="merge",
              primary_key="id",
          )(endpoint)
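
For context, the reworked stripe_source and the new async_stripe_source no longer default the endpoints argument; callers must pass the tuple explicitly. A minimal standalone sketch of driving the new async source (hypothetical wiring: ingestr normally constructs sources through its factory, and the secret key and destination below are placeholders):

    import dlt

    from ingestr.src.stripe_analytics import async_stripe_source

    pipeline = dlt.pipeline(
        pipeline_name="stripe_demo", destination="duckdb", dataset_name="stripe"
    )
    source = async_stripe_source(
        endpoints=("Charge",),            # must now be passed explicitly
        stripe_secret_key="sk_test_...",  # placeholder
        max_workers=4,                    # concurrent time-chunk fetches
        rate_limit_delay=0.03,            # seconds between requests
    )
    pipeline.run(source)

Note the write dispositions: the async source replaces the whole table on every run, while incremental_stripe_source now merges on the primary key id.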
ingestr/src/stripe_analytics/helpers.py CHANGED
@@ -1,6 +1,9 @@
  """Stripe analytics source helpers"""

- from typing import Any, Dict, Iterable, Optional, Union
+ import asyncio
+ import math
+ from datetime import datetime, timedelta
+ from typing import Any, Dict, Iterable, List, Optional, Union

  import stripe
  from dlt.common import pendulum
@@ -39,6 +42,238 @@ def pagination(
              break


+ def _create_time_chunks(start_ts: int, end_ts: int, num_chunks: int) -> List[tuple]:
+     """
+     Divide a time range into equal chunks for parallel processing.
+
+     Args:
+         start_ts (int): Start timestamp
+         end_ts (int): End timestamp
+         num_chunks (int): Number of chunks to create
+
+     Returns:
+         List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
+     """
+     total_duration = end_ts - start_ts
+     chunk_duration = math.ceil(total_duration / num_chunks)
+
+     chunks = []
+     current_start = start_ts
+
+     for _ in range(num_chunks):
+         current_end = min(current_start + chunk_duration, end_ts)
+         if current_start < end_ts:
+             chunks.append((current_start, current_end))
+         current_start = current_end
+
+         if current_start >= end_ts:
+             break
+
+     return chunks
+
+
+ def _create_adaptive_time_chunks(
+     start_ts: int, end_ts: int, max_workers: int
+ ) -> List[tuple]:
+     """
+     Create time chunks with adaptive sizing - larger chunks for the 2010s (less data expected).
+
+     Args:
+         start_ts (int): Start timestamp
+         end_ts (int): End timestamp
+         max_workers (int): Maximum number of workers
+
+     Returns:
+         List[tuple]: List of (chunk_start, chunk_end) timestamp pairs
+     """
+     chunks = []
+
+     # Key timestamps
+     year_2020_ts = int(pendulum.datetime(2020, 1, 1).timestamp())
+     year_2015_ts = int(pendulum.datetime(2015, 1, 1).timestamp())
+
+     current_start = start_ts
+
+     # Handle 2010-2015: large chunks (2-3 year periods)
+     if current_start < year_2015_ts:
+         chunk_end = min(year_2015_ts, end_ts)
+         if current_start < chunk_end:
+             # Split 2010-2015 into 2-3 chunks max
+             pre_2015_chunks = _create_time_chunks(
+                 current_start, chunk_end, min(3, max_workers)
+             )
+             chunks.extend(pre_2015_chunks)
+             current_start = chunk_end
+
+     # Handle 2015-2020: medium chunks (6 month to 1 year periods)
+     if current_start < year_2020_ts and current_start < end_ts:
+         chunk_end = min(year_2020_ts, end_ts)
+         if current_start < chunk_end:
+             # Split 2015-2020 into smaller chunks
+             duration_2015_2020 = chunk_end - current_start
+             years_2015_2020 = duration_2015_2020 / (365 * 24 * 60 * 60)
+             num_chunks_2015_2020 = min(
+                 max_workers, max(2, int(years_2015_2020 * 2))
+             )  # ~6 months per chunk
+
+             pre_2020_chunks = _create_time_chunks(
+                 current_start, chunk_end, num_chunks_2015_2020
+             )
+             chunks.extend(pre_2020_chunks)
+             current_start = chunk_end
+
+     if current_start < end_ts:
+         # Split post-2020 data into daily chunks for maximum granularity
+         current_chunk_start = current_start
+         while current_chunk_start < end_ts:
+             # Calculate end of current day
+             current_date = datetime.fromtimestamp(current_chunk_start)
+             next_day = current_date + timedelta(days=1)
+             chunk_end = min(int(next_day.timestamp()), end_ts)
+
+             chunks.append((current_chunk_start, chunk_end))
+             current_chunk_start = chunk_end
+
+     return chunks
+
+
+ def _fetch_chunk_data_streaming(
+     endpoint: str, start_ts: int, end_ts: int
+ ) -> List[List[TDataItem]]:
+     """
+     Fetch data for a specific time chunk using sequential pagination with a memory-efficient approach.
+
+     Args:
+         endpoint (str): The Stripe endpoint to fetch from
+         start_ts (int): Start timestamp for this chunk
+         end_ts (int): End timestamp for this chunk
+
+     Returns:
+         List[List[TDataItem]]: List of batches of data items
+     """
+     # For streaming, we still need to collect the chunk data to maintain order,
+     # but we can optimize by not holding all data in memory at once
+     print(
+         f"Fetching chunk {datetime.fromtimestamp(start_ts).strftime('%Y-%m-%d')}-{datetime.fromtimestamp(end_ts).strftime('%Y-%m-%d')}"
+     )
+     chunk_data = []
+     batch_count = 0
+
+     for batch in pagination(endpoint, start_ts, end_ts):
+         chunk_data.append(batch)
+         batch_count += 1
+         print(
+             f"Processed {batch_count} batches for chunk {datetime.fromtimestamp(start_ts).strftime('%Y-%m-%d')}-{datetime.fromtimestamp(end_ts).strftime('%Y-%m-%d')}"
+         )
+
+     return chunk_data
+
+
+ async def async_pagination(
+     endpoint: str, start_date: Optional[Any] = None, end_date: Optional[Any] = None
+ ) -> Iterable[TDataItem]:
+     """
+     Async version of pagination that retrieves data from an endpoint page by page.
+
+     Args:
+         endpoint (str): The endpoint to retrieve data from.
+         start_date (Optional[Any]): An optional start date to limit the data retrieved. Defaults to None.
+         end_date (Optional[Any]): An optional end date to limit the data retrieved. Defaults to None.
+
+     Returns:
+         Iterable[TDataItem]: Data items retrieved from the endpoint.
+     """
+     starting_after = None
+     while True:
+         response = await stripe_get_data_async(
+             endpoint,
+             start_date=start_date,
+             end_date=end_date,
+             starting_after=starting_after,
+         )
+
+         if len(response["data"]) > 0:
+             starting_after = response["data"][-1]["id"]
+             yield response["data"]
+
+         if not response["has_more"]:
+             break
+
+
+ async def async_parallel_pagination(
+     endpoint: str,
+     max_workers: int = 8,
+     rate_limit_delay: float = 5,
+ ) -> Iterable[TDataItem]:
+     """
+     ULTRA-FAST async parallel pagination - yields data in random order for maximum speed.
+     No ordering constraints - pure performance optimization. The time window is fixed
+     internally to 2010-01-01 through today.
+
+     Args:
+         endpoint (str): The endpoint to retrieve data from.
+         max_workers (int): Maximum number of concurrent async tasks. Defaults to 8 for balanced speed/rate-limit respect.
+         rate_limit_delay (float): Minimal delay between requests. Defaults to 5 seconds.
+
+     Returns:
+         Iterable[TDataItem]: Data items retrieved from the endpoint (RANDOM ORDER FOR SPEED).
+     """
+
+     start_date = pendulum.datetime(2010, 1, 1)
+     end_date = pendulum.now()
+     start_ts = transform_date(start_date)
+     end_ts = transform_date(end_date)
+
+     # Create time chunks with larger chunks for the 2010s (less data expected)
+     time_chunks = _create_adaptive_time_chunks(start_ts, end_ts, max_workers)
+
+     # Use an asyncio semaphore to control concurrency and respect rate limits
+     semaphore = asyncio.Semaphore(max_workers)
+
+     async def fetch_chunk_with_semaphore(chunk_start: int, chunk_end: int):
+         async with semaphore:
+             return await _fetch_chunk_data_async_fast(endpoint, chunk_start, chunk_end)
+
+     # Create all tasks
+     tasks = [
+         fetch_chunk_with_semaphore(chunk_start, chunk_end)
+         for chunk_start, chunk_end in time_chunks
+     ]
+
+     for coro in asyncio.as_completed(tasks):
+         try:
+             chunk_data = await coro
+
+             for batch in chunk_data:
+                 yield batch
+
+         except Exception as exc:
+             print(f"Async chunk processing generated an exception: {exc}")
+             raise exc
+
+
+ async def _fetch_chunk_data_async_fast(
+     endpoint: str, start_ts: int, end_ts: int
+ ) -> List[List[TDataItem]]:
+     """
+     ULTRA-FAST async chunk fetcher - no metadata overhead, direct data return.
+
+     Args:
+         endpoint (str): The Stripe endpoint to fetch from
+         start_ts (int): Start timestamp for this chunk
+         end_ts (int): End timestamp for this chunk
+
+     Returns:
+         List[List[TDataItem]]: Raw batches with zero overhead
+     """
+     chunk_data = []
+     async for batch in async_pagination(endpoint, start_ts, end_ts):
+         chunk_data.append(batch)
+
+     return chunk_data
+
+
  def transform_date(date: Union[str, DateTime, int]) -> int:
      if isinstance(date, str):
          date = pendulum.from_format(date, "%Y-%m-%dT%H:%M:%SZ")
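
The adaptive chunker above biases chunk size by era: a handful of multi-year chunks before 2015, roughly half-year chunks for 2015-2020, and one chunk per day after 2020, on the assumption that recent data is denser. For the plain equal-split helper, a quick illustration (calling the private function directly, for demonstration only):

    from dlt.common import pendulum

    from ingestr.src.stripe_analytics.helpers import _create_time_chunks

    # A 28-day window split into 4 equal 7-day chunks.
    start = int(pendulum.datetime(2024, 1, 1).timestamp())
    end = int(pendulum.datetime(2024, 1, 29).timestamp())
    for lo, hi in _create_time_chunks(start, end, 4):
        print(pendulum.from_timestamp(lo).to_date_string(), "->",
              pendulum.from_timestamp(hi).to_date_string())
    # 2024-01-01 -> 2024-01-08 ... 2024-01-22 -> 2024-01-29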
@@ -66,3 +301,53 @@ def stripe_get_data(
          created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
      )
      return dict(resource_dict)
+
+
+ async def stripe_get_data_async(
+     resource: str,
+     start_date: Optional[Any] = None,
+     end_date: Optional[Any] = None,
+     **kwargs: Any,
+ ) -> Dict[Any, Any]:
+     """Async version of stripe_get_data"""
+     if start_date:
+         start_date = transform_date(start_date)
+     if end_date:
+         end_date = transform_date(end_date)
+
+     if resource == "Subscription":
+         kwargs.update({"status": "all"})
+
+     from stripe import RateLimitError
+
+     max_retries = 50
+     retry_count = 0
+     max_wait_time = 10.0  # cap in seconds
+
+     while retry_count < max_retries:
+         try:
+             resource_dict = await getattr(stripe, resource).list_async(
+                 created={"gte": start_date, "lt": end_date}, limit=100, **kwargs
+             )
+             return dict(resource_dict)
+         except RateLimitError:
+             retry_count += 1
+             if retry_count < max_retries:
+                 wait_time = min(2**retry_count * 0.001, max_wait_time)
+                 print(
+                     f"Got rate limited, sleeping {wait_time} seconds before retrying...",
+                     flush=True,
+                 )
+                 await asyncio.sleep(wait_time)
+             else:
+                 # Re-raise the last exception once retries are exhausted
+                 print(f"✗ Failed to fetch {resource} after {max_retries} retries")
+                 raise
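
The retry loop backs off exponentially from a very small base: 2**retry_count * 0.001 is 2 ms on the first retry and doubles each time, so the cap only binds from about the 14th retry on. A worked sketch of the schedule (assuming the seconds-based cap used above):

    for retry in range(1, 16):
        wait = min(2**retry * 0.001, 10.0)
        print(f"retry {retry}: sleep {wait:.3f}s")
    # retry 1: 0.002s, retry 10: 1.024s, retry 13: 8.192s, retry 14-15: capped at 10.000s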
ingestr/src/stripe_analytics/settings.py CHANGED
@@ -2,13 +2,65 @@

  # the most popular endpoints
  # Full list of the Stripe API endpoints you can find here: https://stripe.com/docs/api.
- ENDPOINTS = (
-     "Subscription",
-     "Account",
-     "Coupon",
-     "Customer",
-     "Product",
-     "Price",
- )
- # possible incremental endpoints
- INCREMENTAL_ENDPOINTS = ("Event", "Invoice", "BalanceTransaction")
+ ENDPOINTS = {
+     "account": "Account",
+     "applepaydomain": "ApplePayDomain",
+     "apple_pay_domain": "ApplePayDomain",
+     "applicationfee": "ApplicationFee",
+     "application_fee": "ApplicationFee",
+     "checkoutsession": "CheckoutSession",
+     "checkout_session": "CheckoutSession",
+     "coupon": "Coupon",
+     "charge": "Charge",
+     "customer": "Customer",
+     "dispute": "Dispute",
+     "paymentintent": "PaymentIntent",
+     "payment_intent": "PaymentIntent",
+     "paymentlink": "PaymentLink",
+     "payment_link": "PaymentLink",
+     "paymentmethod": "PaymentMethod",
+     "payment_method": "PaymentMethod",
+     "paymentmethoddomain": "PaymentMethodDomain",
+     "payment_method_domain": "PaymentMethodDomain",
+     "payout": "Payout",
+     "plan": "Plan",
+     "price": "Price",
+     "product": "Product",
+     "promotioncode": "PromotionCode",
+     "promotion_code": "PromotionCode",
+     "quote": "Quote",
+     "refund": "Refund",
+     "review": "Review",
+     "setupattempt": "SetupAttempt",
+     "setup_attempt": "SetupAttempt",
+     "setupintent": "SetupIntent",
+     "setup_intent": "SetupIntent",
+     "shippingrate": "ShippingRate",
+     "shipping_rate": "ShippingRate",
+     "subscription": "Subscription",
+     "subscriptionitem": "SubscriptionItem",
+     "subscription_item": "SubscriptionItem",
+     "subscriptionschedule": "SubscriptionSchedule",
+     "subscription_schedule": "SubscriptionSchedule",
+     "transfer": "Transfer",
+     "taxcode": "TaxCode",
+     "tax_code": "TaxCode",
+     "taxid": "TaxId",
+     "tax_id": "TaxId",
+     "taxrate": "TaxRate",
+     "tax_rate": "TaxRate",
+     "topup": "Topup",
+     "top_up": "Topup",
+     "webhookendpoint": "WebhookEndpoint",
+     "webhook_endpoint": "WebhookEndpoint",
+     "invoice": "Invoice",
+     "invoiceitem": "InvoiceItem",
+     "invoice_item": "InvoiceItem",
+     "invoicelineitem": "InvoiceLineItem",
+     "invoice_line_item": "InvoiceLineItem",
+     "balancetransaction": "BalanceTransaction",
+     "balance_transaction": "BalanceTransaction",
+     "creditnote": "CreditNote",
+     "credit_note": "CreditNote",
+     "event": "Event",
+ }
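
ENDPOINTS is now a lookup table from user-facing names (both squashed and snake_case spellings) to Stripe resource class names, rather than a tuple of default endpoints. A sketch of how a caller might normalize input against it (resolve_endpoint is a hypothetical helper, not part of the package):

    from ingestr.src.stripe_analytics.settings import ENDPOINTS

    def resolve_endpoint(name: str) -> str:
        # Hypothetical: map a user-supplied table name to Stripe's class name.
        try:
            return ENDPOINTS[name.strip().lower()]
        except KeyError:
            raise ValueError(f"Unsupported Stripe endpoint: {name!r}") from None

    assert resolve_endpoint("payment_intent") == "PaymentIntent"
    assert resolve_endpoint("paymentintent") == "PaymentIntent"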
ingestr/src/telemetry/event.py CHANGED
@@ -1,13 +1,4 @@
  import os
- import platform
-
- import machineid
- import rudderstack.analytics as rudder_analytics  # type: ignore
-
- from ingestr.src.version import __version__  # type: ignore
-
- rudder_analytics.write_key = "2cUr13DDQcX2x2kAfMEfdrKvrQa"
- rudder_analytics.dataPlaneUrl = "https://getbruinbumlky.dataplane.rudderstack.com"


  def track(event_name, event_properties: dict):
@@ -16,6 +7,16 @@ def track(event_name, event_properties: dict):
      ):
          return

+     import platform
+
+     import machineid
+     import rudderstack.analytics as rudder_analytics  # type: ignore
+
+     from ingestr.src.version import __version__  # type: ignore
+
+     rudder_analytics.write_key = "2cUr13DDQcX2x2kAfMEfdrKvrQa"
+     rudder_analytics.dataPlaneUrl = "https://getbruinbumlky.dataplane.rudderstack.com"
+
      try:
          if not event_properties:
              event_properties = {}
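
This change defers the platform, machineid, and rudderstack imports (and the RudderStack configuration) until after the opt-out check, so users who disable telemetry never pay the import cost. The general shape of the pattern, with a hypothetical flag name standing in for the elided check:

    import os

    def track(event_name: str, event_properties: dict) -> None:
        if os.environ.get("DISABLE_TELEMETRY") == "true":  # hypothetical flag
            return  # opted-out users never trigger the imports below
        import rudderstack.analytics as rudder_analytics  # deferred import

        rudder_analytics.write_key = "..."  # configured on first use, as in the diff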
ingestr/src/tiktok_ads/__init__.py CHANGED
@@ -112,7 +112,8 @@ def tiktok_source(
      datetime=(
          dlt.sources.incremental(
              incremental_loading_param,
-             start_date,
+             initial_value=start_date,
+             end_value=end_date,
              range_end="closed",
              range_start="closed",
          )
@@ -120,15 +121,20 @@ def tiktok_source(
          else None
      ),
  ) -> Iterable[TDataItem]:
-     current_date = start_date.in_tz(timezone)
+     start_date_tz_adjusted = start_date.in_tz(timezone)
+     end_date_tz_adjusted = end_date.in_tz(timezone)

      if datetime is not None:
-         datetime_str = datetime.last_value
-         current_date = ensure_pendulum_datetime(datetime_str).in_tz(timezone)
+         start_date_tz_adjusted = ensure_pendulum_datetime(
+             datetime.last_value
+         ).in_tz(timezone)
+         end_date_tz_adjusted = ensure_pendulum_datetime(datetime.end_value).in_tz(
+             timezone
+         )

      list_of_interval = find_intervals(
-         current_date=current_date,
-         end_date=end_date,
+         current_date=start_date_tz_adjusted,
+         end_date=end_date_tz_adjusted,
          interval_days=interval_days,
      )
ingestr/src/tiktok_ads/tiktok_helpers.py CHANGED
@@ -17,7 +17,6 @@ def retry_on_limit(

  def create_client() -> requests.Session:
      return Client(
-         request_timeout=10.0,
          raise_for_status=False,
          retry_condition=retry_on_limit,
          request_max_attempts=12,
ingestr/src/trustpilot/__init__.py ADDED
@@ -0,0 +1,48 @@
+ """Trustpilot source for ingesting reviews."""
+
+ from typing import Any, Dict, Generator, Iterable
+
+ import dlt
+ import pendulum
+ from dlt.sources import DltResource
+
+ from .client import TrustpilotClient
+
+
+ @dlt.source()
+ def trustpilot_source(
+     business_unit_id: str,
+     start_date: str,
+     end_date: str | None,
+     api_key: str,
+     per_page: int = 1000,
+ ) -> Iterable[DltResource]:
+     """Return resources for Trustpilot."""
+
+     client = TrustpilotClient(api_key=api_key)
+
+     @dlt.resource(name="reviews", write_disposition="merge", primary_key="id")
+     def reviews(
+         dateTime=(
+             dlt.sources.incremental(
+                 "updated_at",
+                 initial_value=start_date,
+                 end_value=end_date,
+                 range_start="closed",
+                 range_end="closed",
+             )
+         ),
+     ) -> Generator[Dict[str, Any], None, None]:
+         if end_date is None:
+             end_dt = pendulum.now(tz="UTC").isoformat()
+         else:
+             end_dt = dateTime.end_value
+         start_dt = dateTime.last_value
+         yield from client.paginated_reviews(
+             business_unit_id=business_unit_id,
+             per_page=per_page,
+             updated_since=start_dt,
+             end_date=end_dt,
+         )
+
+     yield reviews
ingestr/src/trustpilot/client.py ADDED
@@ -0,0 +1,48 @@
+ """Simple Trustpilot API client."""
+
+ from typing import Any, Dict, Iterable
+
+ import pendulum
+ from dlt.sources.helpers import requests
+
+
+ class TrustpilotClient:
+     """Client for the Trustpilot public API."""
+
+     def __init__(self, api_key: str) -> None:
+         self.api_key = api_key
+         self.base_url = "https://api.trustpilot.com/v1"
+
+     def _get(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]:
+         params = dict(params)
+         params["apikey"] = self.api_key
+         response = requests.get(f"{self.base_url}{endpoint}", params=params)
+         response.raise_for_status()
+         return response.json()
+
+     def paginated_reviews(
+         self,
+         business_unit_id: str,
+         updated_since: str,
+         end_date: str,
+         per_page: int = 1000,
+     ) -> Iterable[Dict[str, Any]]:
+         end_date_dt = pendulum.parse(end_date)  # parse once, not per review
+         page = 1
+         while True:
+             params: Dict[str, Any] = {"perPage": per_page, "page": page}
+             if updated_since:
+                 params["updatedSince"] = updated_since
+             data = self._get(f"/business-units/{business_unit_id}/reviews", params)
+             reviews = data.get("reviews", data)
+             if not reviews:
+                 break
+             for review in reviews:
+                 review["updated_at"] = review["updatedAt"]
+                 review_dt = pendulum.parse(review["updated_at"])
+                 if review_dt > end_date_dt:  # type: ignore
+                     continue
+                 yield review
+             if len(reviews) < per_page:
+                 break
+             page += 1
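
A short consumption sketch for the client (assuming a valid API key; the ids and dates below are placeholders):

    client = TrustpilotClient(api_key="YOUR_API_KEY")
    for review in client.paginated_reviews(
        business_unit_id="abc123",  # placeholder
        updated_since="2024-01-01T00:00:00Z",
        end_date="2024-06-30T00:00:00Z",
        per_page=100,
    ):
        print(review["id"], review["updated_at"])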
ingestr/src/version.py CHANGED
@@ -1 +1,6 @@
- __version__ = "0.13.2"
+ try:
+     from ingestr.src import buildinfo  # type: ignore[import-not-found,attr-defined]
+
+     __version__ = buildinfo.version.lstrip("v")
+ except ImportError:
+     __version__ = "0.0.0-dev"
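
The version is now read from a generated buildinfo module (the new one-line ingestr/src/buildinfo.py in this diff), with a dev fallback when that module is absent. Presumably the generated file is a single assignment along these lines (the exact value is written at release time; this is an assumption, not the file's verbatim content):

    # ingestr/src/buildinfo.py -- generated at build time (assumed contents)
    version = "v0.14.104"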