ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/partition.py (new file)
@@ -0,0 +1,32 @@
+ from typing import Dict
+
+ from dlt.common.schema.typing import TColumnSchema
+ from dlt.sources import DltResource, DltSource
+
+ import ingestr.src.resource as resource
+
+
+ def apply_athena_hints(
+     source: DltSource | DltResource,
+     partition_column: str,
+     additional_hints: Dict[str, TColumnSchema] = {},
+ ) -> None:
+     from dlt.destinations.adapters import athena_adapter, athena_partition
+
+     def _apply_partition_hint(resource: DltResource) -> None:
+         columns = resource.columns if resource.columns else {}
+
+         partition_hint = (
+             columns.get(partition_column)  # type: ignore
+             or additional_hints.get(partition_column)
+         )
+
+         athena_adapter(
+             resource,
+             athena_partition.day(partition_column)
+             if partition_hint
+             and partition_hint.get("data_type") in ("timestamp", "date")
+             else partition_column,
+         )
+
+     resource.for_each(source, _apply_partition_hint)
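For orientation, a minimal sketch of how this new helper could be applied, assuming a resource that declares a timestamp column; the resource and its data are hypothetical, and the single-resource call relies on apply_athena_hints accepting a DltResource as its signature states:

    import dlt

    from ingestr.src.partition import apply_athena_hints

    # Hypothetical resource with a declared timestamp column.
    @dlt.resource(columns={"created_at": {"data_type": "timestamp"}})
    def events():
        yield {"id": 1, "created_at": "2024-01-01T00:00:00+00:00"}

    # "created_at" is a timestamp, so the adapter partitions by day
    # (athena_partition.day); any other data type falls back to plain
    # column partitioning.
    apply_athena_hints(events(), "created_at")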
ingestr/src/personio/__init__.py (new file)
@@ -0,0 +1,331 @@
+ """Fetches Personio Employees, Absences, Attendances."""
+
+ from typing import Iterable, Optional
+
+ import dlt
+ from dlt.common import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TAnyDateTime, TDataItem
+ from dlt.sources import DltResource
+
+ from .helpers import PersonioAPI
+
+
+ @dlt.source(name="personio", max_table_nesting=0)
+ def personio_source(
+     start_date: TAnyDateTime,
+     end_date: Optional[TAnyDateTime] = None,
+     client_id: str = dlt.secrets.value,
+     client_secret: str = dlt.secrets.value,
+     items_per_page: int = 200,
+ ) -> Iterable[DltResource]:
+     """
+     The source for the Personio pipeline. Available resources are employees, absences, and attendances.
+
+     Args:
+         client_id: The client ID of your app.
+         client_secret: The client secret of your app.
+         items_per_page: The max number of items to fetch per page. Defaults to 200.
+     Returns:
+         Iterable: A list of DltResource objects representing the data resources.
+     """
+
+     client = PersonioAPI(client_id, client_secret)
+
+     @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+     def employees(
+         updated_at: dlt.sources.incremental[
+             pendulum.DateTime
+         ] = dlt.sources.incremental(
+             "last_modified_at", initial_value=None, allow_external_schedulers=True
+         ),
+         items_per_page: int = items_per_page,
+     ) -> Iterable[TDataItem]:
+         """
+         The resource for employees, supports incremental loading and pagination.
+
+         Args:
+             updated_at: The saved state of the last 'last_modified_at' value.
+             items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+         Returns:
+             Iterable: A generator of employees.
+         """
+
+         def convert_item(item: TDataItem) -> TDataItem:
+             """Converts an employee item."""
+             attributes = item.get("attributes", {})
+             output = {}
+             for value in attributes.values():
+                 name = value["universal_id"]
+                 if not name:
+                     label: str = value["label"].replace(" ", "_")
+                     name = label.lower()
+
+                 if value["type"] == "date" and value["value"]:
+                     output[name] = ensure_pendulum_datetime(value["value"])
+                 else:
+                     output[name] = value["value"]
+             return output
+
+         if updated_at.last_value:
+             last_value = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+         else:
+             last_value = None
+
+         params = {"limit": items_per_page, "updated_since": last_value}
+
+         pages = client.get_pages("company/employees", params=params)
+         for page in pages:
+             yield [convert_item(item) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+     def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]:
+         """
+         The resource for absence types (time-off-types), supports pagination.
+
+         Args:
+             items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+         Returns:
+             Iterable: A generator of absences.
+         """
+
+         pages = client.get_pages(
+             "company/time-off-types", params={"limit": items_per_page}
+         )
+
+         for page in pages:
+             yield [item.get("attributes", {}) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+     def absences(
+         updated_at: dlt.sources.incremental[
+             pendulum.DateTime
+         ] = dlt.sources.incremental(
+             "updated_at", initial_value=None, allow_external_schedulers=True
+         ),
+         items_per_page: int = items_per_page,
+     ) -> Iterable[TDataItem]:
+         """
+         The resource for absence (time-offs), supports incremental loading and pagination.
+
+         Args:
+             updated_at: The saved state of the last 'updated_at' value.
+             items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+         Returns:
+             Iterable: A generator of absences.
+         """
+         if updated_at.last_value:
+             updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+         else:
+             updated_iso = None
+
+         params = {
+             "limit": items_per_page,
+             "updated_since": updated_iso,
+         }
+
+         def convert_item(item: TDataItem) -> TDataItem:
+             output = item.get("attributes", {})
+             output["created_at"] = ensure_pendulum_datetime(output["created_at"])
+             output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+             return output
+
+         pages = client.get_pages(
+             "company/time-offs",
+             params=params,
+             offset_by_page=True,
+         )
+
+         for page in pages:
+             yield [convert_item(item) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="merge", max_table_nesting=0)
+     def attendances(
+         start_date: TAnyDateTime = start_date,
+         end_date: Optional[TAnyDateTime] = end_date,
+         updated_at: dlt.sources.incremental[
+             pendulum.DateTime
+         ] = dlt.sources.incremental(
+             "updated_at", initial_value=None, allow_external_schedulers=True
+         ),
+         items_per_page: int = items_per_page,
+     ) -> Iterable[TDataItem]:
+         """
+         The resource for attendances, supports incremental loading and pagination.
+
+         Args:
+             start_date: The start date to fetch attendances from.
+             end_date: The end date to fetch attendances from. Defaults to now.
+             updated_at: The saved state of the last 'updated_at' value.
+             items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+         Returns:
+             Iterable: A generator of attendances.
+         """
+
+         end_date = end_date or pendulum.now()
+         if updated_at.last_value:
+             updated_iso = updated_at.last_value.format("YYYY-MM-DDTHH:mm:ss")
+         else:
+             updated_iso = None
+
+         params = {
+             "limit": items_per_page,
+             "start_date": ensure_pendulum_datetime(start_date).to_date_string(),
+             "end_date": ensure_pendulum_datetime(end_date).to_date_string(),
+             "updated_from": updated_iso,
+             "includePending": True,
+         }
+         pages = client.get_pages(
+             "company/attendances",
+             params=params,
+         )
+
+         def convert_item(item: TDataItem) -> TDataItem:
+             """Converts an attendance item."""
+             output = dict(id=item["id"], **item.get("attributes"))
+             output["date"] = ensure_pendulum_datetime(output["date"]).date()
+             output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+             return output
+
+         for page in pages:
+             yield [convert_item(item) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+     def projects() -> Iterable[TDataItem]:
+         """
+         The resource for projects.
+
+         Returns:
+             Iterable: A generator of projects.
+         """
+
+         pages = client.get_pages("company/attendances/projects")
+
+         def convert_item(item: TDataItem) -> TDataItem:
+             """Converts an attendance item."""
+             output = dict(id=item["id"], **item.get("attributes"))
+             output["created_at"] = ensure_pendulum_datetime(output["created_at"])
+             output["updated_at"] = ensure_pendulum_datetime(output["updated_at"])
+             return output
+
+         for page in pages:
+             yield [convert_item(item) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+     def document_categories() -> Iterable[TDataItem]:
+         """
+         The resource for document_categories.
+
+         Returns:
+             Iterable: A generator of document_categories.
+         """
+
+         pages = client.get_pages("company/document-categories")
+
+         def convert_item(item: TDataItem) -> TDataItem:
+             """Converts an document_categories item."""
+             output = dict(id=item["id"], **item.get("attributes"))
+             return output
+
+         for page in pages:
+             yield [convert_item(item) for item in page]
+
+     @dlt.resource(primary_key="id", write_disposition="replace", max_table_nesting=0)
+     def custom_reports_list() -> Iterable[TDataItem]:
+         """
+         The resource for custom_reports.
+
+         Returns:
+             Iterable: A generator of custom_reports.
+         """
+
+         pages = client.get_pages("company/custom-reports/reports")
+
+         for page in pages:
+             yield [item.get("attributes", {}) for item in page]
+
+     @dlt.transformer(
+         data_from=employees,
+         write_disposition="merge",
+         primary_key=["employee_id", "id"],
+     )
+     @dlt.defer
+     def employees_absences_balance(employees_item: TDataItem) -> Iterable[TDataItem]:
+         """
+         The transformer for employees_absences_balance.
+
+         Args:
+             employees_item: The employee data.
+
+         Returns:
+             Iterable: A generator of employees_absences_balance for each employee.
+         """
+         for employee in employees_item:
+             employee_id = employee["id"]
+             pages = client.get_pages(
+                 f"company/employees/{employee_id}/absences/balance",
+             )
+
+             for page in pages:
+                 yield [dict(employee_id=employee_id, **i) for i in page]
+
+     @dlt.transformer(
+         data_from=custom_reports_list,
+         write_disposition="merge",
+         primary_key=["report_id", "item_id"],
+     )
+     @dlt.defer
+     def custom_reports(
+         custom_reports_item: TDataItem, items_per_page: int = items_per_page
+     ) -> Iterable[TDataItem]:
+         """
+         The transformer for custom reports, supports pagination.
+
+         Args:
+             custom_reports_item: The custom_report data.
+             items_per_page: The max number of items to fetch per page. Defaults to 200.
+
+         Returns:
+             Iterable: A generator of employees_absences_balance for each employee.
+         """
+
+         def convert_item(item: TDataItem, report_id: str) -> TDataItem:
+             """Converts an employee item."""
+             attributes = item.pop("attributes")
+             output = dict(report_id=report_id, item_id=list(item.values())[0])
+             for value in attributes:
+                 name = value["attribute_id"]
+                 if value["data_type"] == "date" and value["value"]:
+                     output[name] = ensure_pendulum_datetime(value["value"])
+                 else:
+                     output[name] = value["value"]
+             return output
+
+         for custom_report in custom_reports_item:
+             report_id = custom_report["id"]
+             pages = client.get_pages(
+                 f"company/custom-reports/reports/{report_id}",
+                 params={"limit": items_per_page},
+                 offset_by_page=True,
+             )
+
+             for page in pages:
+                 for report in page:
+                     report_items = report.get("attributes", {}).get("items", [])
+                     yield [convert_item(item, report_id) for item in report_items]
+
+     return (
+         employees,
+         absence_types,
+         absences,
+         attendances,
+         projects,
+         document_categories,
+         employees_absences_balance,
+         custom_reports_list,
+         custom_reports,
+     )
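A minimal pipeline sketch for the new Personio source; the destination, dataset name, and credentials below are illustrative, not part of the package:

    import dlt
    import pendulum

    from ingestr.src.personio import personio_source

    pipeline = dlt.pipeline(
        pipeline_name="personio",
        destination="duckdb",
        dataset_name="personio_raw",
    )
    source = personio_source(
        start_date=pendulum.datetime(2024, 1, 1),
        client_id="...",      # hypothetical credential
        client_secret="...",  # hypothetical credential
    )
    # Load a subset of the declared resources; transformers such as
    # employees_absences_balance are selected the same way.
    info = pipeline.run(source.with_resources("employees", "absences", "attendances"))
    print(info)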
ingestr/src/personio/helpers.py (new file)
@@ -0,0 +1,86 @@
+ """Personio source helpers"""
+
+ from typing import Any, Iterable, Optional
+ from urllib.parse import urljoin
+
+ from dlt.common.typing import Dict, TDataItems
+ from dlt.sources.helpers import requests
+
+
+ class PersonioAPI:
+     """A Personio API client."""
+
+     base_url = "https://api.personio.de/v1/"
+
+     def __init__(self, client_id: str, client_secret: str) -> None:
+         """
+         Args:
+             client_id: The client ID of your app.
+             client_secret: The client secret of your app.
+         """
+         self.client_id = client_id
+         self.client_secret = client_secret
+         self.access_token = self.get_token()
+
+     def get_token(self) -> str:
+         """Get an access token from Personio.
+
+         Returns:
+             The access token.
+         """
+         headers = {"Content-Type": "application/json", "Accept": "application/json"}
+         data = {"client_id": self.client_id, "client_secret": self.client_secret}
+         url = urljoin(self.base_url, "auth")
+         response = requests.request("POST", url, headers=headers, json=data)
+         json_response = response.json()
+         token: str = json_response["data"]["token"]
+         return token
+
+     def get_pages(
+         self,
+         resource: str,
+         params: Optional[Dict[str, Any]] = None,
+         offset_by_page: bool = False,
+     ) -> Iterable[TDataItems]:
+         """Get all pages from Personio using requests.
+
+         Args:
+             resource: The resource to get pages for (e.g. employees, absences, attendances).
+             params: The parameters for the resource.
+             offset_by_page (bool): If True, offset increases by 1 per page; else, increases by page_size.
+
+         Yields:
+             List of data items from the page
+         """
+         params = params or {}
+         headers = {"Authorization": f"Bearer {self.access_token}"}
+         params.update({"offset": int(offset_by_page), "page": int(offset_by_page)})
+         url = urljoin(self.base_url, resource)
+         starts_from_zero = False
+         while True:
+             response = requests.get(url, headers=headers, params=params)
+             json_response = response.json()
+             # Get an item list from the page
+             yield json_response["data"]
+
+             metadata = json_response.get("metadata")
+             if not metadata:
+                 break
+
+             total_pages = metadata.get("total_pages")
+             current_page = metadata.get("current_page")
+             if current_page == 0:
+                 starts_from_zero = True
+
+             if (
+                 current_page >= (total_pages - int(starts_from_zero))
+                 or not json_response["data"]
+             ):
+                 break
+
+             if offset_by_page:
+                 params["offset"] += 1
+                 params["page"] += 1
+             else:
+                 params["offset"] += params["limit"]
+                 params["page"] += 1
ingestr/src/phantombuster/__init__.py (new file)
@@ -0,0 +1,65 @@
+ from typing import Iterable, Optional
+
+ import dlt
+ import pendulum
+ import requests
+ from dlt.common.typing import TAnyDateTime, TDataItem
+ from dlt.sources import DltResource
+ from dlt.sources.helpers.requests import Client
+
+ from ingestr.src.phantombuster.client import PhantombusterClient
+
+
+ def retry_on_limit(
+     response: Optional[requests.Response], exception: Optional[BaseException]
+ ) -> bool:
+     if response is not None and response.status_code == 429:
+         return True
+     return False
+
+
+ def create_client() -> requests.Session:
+     return Client(
+         raise_for_status=False,
+         retry_condition=retry_on_limit,
+         request_max_attempts=12,
+         request_backoff_factor=2,
+     ).session
+
+
+ @dlt.source(max_table_nesting=0)
+ def phantombuster_source(
+     api_key: str, agent_id: str, start_date: TAnyDateTime, end_date: TAnyDateTime | None
+ ) -> Iterable[DltResource]:
+     client = PhantombusterClient(api_key)
+
+     @dlt.resource(
+         write_disposition="merge",
+         primary_key="container_id",
+         columns={
+             "partition_dt": {"data_type": "date", "partition": True},
+         },
+     )
+     def completed_phantoms(
+         dateTime=(
+             dlt.sources.incremental(
+                 "ended_at",
+                 initial_value=start_date,
+                 end_value=end_date,
+                 range_start="closed",
+                 range_end="closed",
+             )
+         ),
+     ) -> Iterable[TDataItem]:
+         if dateTime.end_value is None:
+             end_dt = pendulum.now(tz="UTC")
+         else:
+             end_dt = dateTime.end_value
+
+         start_dt = dateTime.last_value
+
+         yield client.fetch_containers_result(
+             create_client(), agent_id, start_date=start_dt, end_date=end_dt
+         )
+
+     return completed_phantoms
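A hypothetical end-to-end run of the new Phantombuster source; the key, agent id, and destination are illustrative:

    import dlt
    import pendulum

    from ingestr.src.phantombuster import phantombuster_source

    pipeline = dlt.pipeline(
        pipeline_name="phantombuster",
        destination="duckdb",
        dataset_name="phantombuster_raw",
    )
    source = phantombuster_source(
        api_key="pb-api-key",   # hypothetical key
        agent_id="1234567890",  # hypothetical agent id
        start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
        end_date=None,          # open-ended: the resource substitutes "now" (UTC)
    )
    pipeline.run(source)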
ingestr/src/phantombuster/client.py (new file)
@@ -0,0 +1,87 @@
+ from typing import Union
+
+ import pendulum
+ import requests
+
+
+ class PhantombusterClient:
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+
+     def _get_headers(self):
+         return {
+             "X-Phantombuster-Key-1": self.api_key,
+             "accept": "application/json",
+         }
+
+     def fetch_containers_result(
+         self,
+         session: requests.Session,
+         agent_id: str,
+         start_date: pendulum.DateTime,
+         end_date: pendulum.DateTime,
+     ):
+         url = "https://api.phantombuster.com/api/v2/containers/fetch-all/"
+         before_ended_at = None
+         limit = 100
+
+         started_at = start_date.int_timestamp * 1000 + int(
+             start_date.microsecond / 1000
+         )
+         ended_at = end_date.int_timestamp * 1000 + int(end_date.microsecond / 1000)
+
+         while True:
+             params: dict[str, Union[str, int, float, bytes, None]] = {
+                 "agentId": agent_id,
+                 "limit": limit,
+                 "mode": "finalized",
+             }
+
+             if before_ended_at:
+                 params["beforeEndedAt"] = before_ended_at
+
+             response = session.get(url=url, headers=self._get_headers(), params=params)
+             data = response.json()
+             containers = data.get("containers", [])
+
+             for container in containers:
+                 container_ended_at = container.get("endedAt")
+
+                 if before_ended_at is None or before_ended_at > container_ended_at:
+                     before_ended_at = container_ended_at
+
+                 if container_ended_at < started_at or container_ended_at > ended_at:
+                     continue
+
+                 try:
+                     result = self.fetch_result_object(session, container["id"])
+                     partition_dt = pendulum.from_timestamp(
+                         container_ended_at / 1000, tz="UTC"
+                     ).date()
+                     container_ended_at_datetime = pendulum.from_timestamp(
+                         container_ended_at / 1000, tz="UTC"
+                     )
+                     row = {
+                         "container_id": container["id"],
+                         "container": container,
+                         "result": result,
+                         "partition_dt": partition_dt,
+                         "ended_at": container_ended_at_datetime,
+                     }
+                     yield row
+
+                 except requests.RequestException as e:
+                     print(f"Error fetching result for container {container['id']}: {e}")
+
+             if data["maxLimitReached"] is False:
+                 break
+
+     def fetch_result_object(self, session: requests.Session, container_id: str):
+         result_url = (
+             "https://api.phantombuster.com/api/v2/containers/fetch-result-object"
+         )
+         params = {"id": container_id}
+         response = session.get(result_url, headers=self._get_headers(), params=params)
+         response.raise_for_status()
+
+         return response.json()
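The client can also be driven outside a dlt pipeline; a sketch with placeholder credentials, reusing the retrying session from create_client:

    import pendulum

    from ingestr.src.phantombuster import create_client
    from ingestr.src.phantombuster.client import PhantombusterClient

    client = PhantombusterClient("pb-api-key")  # hypothetical key
    rows = client.fetch_containers_result(
        create_client(),
        agent_id="1234567890",  # hypothetical agent id
        start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
        end_date=pendulum.now("UTC"),
    )
    # Timestamps are compared in epoch milliseconds internally; each yielded
    # row carries the container, its result object, and the derived dates.
    for row in rows:
        print(row["container_id"], row["partition_dt"], row["ended_at"])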
ingestr/src/pinterest/__init__.py (new file)
@@ -0,0 +1,82 @@
+ from typing import Iterable
+
+ import dlt
+ import pendulum
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TDataItem
+ from dlt.sources import DltResource
+ from dlt.sources.helpers import requests
+
+
+ @dlt.source(name="pinterest", max_table_nesting=0)
+ def pinterest_source(
+     start_date: pendulum.DateTime,
+     access_token: str,
+     page_size: int = 200,
+     end_date: pendulum.DateTime | None = None,
+ ) -> Iterable[DltResource]:
+     session = requests.Session()
+     session.headers.update({"Authorization": f"Bearer {access_token}"})
+     base_url = "https://api.pinterest.com/v5"
+
+     def fetch_data(
+         endpoint: str,
+         start_dt: pendulum.DateTime,
+         end_dt: pendulum.DateTime,
+     ) -> Iterable[TDataItem]:
+         url = f"{base_url}/{endpoint}"
+         params = {"page_size": page_size}
+         bookmark = None
+         while True:
+             if bookmark:
+                 params["bookmark"] = bookmark
+
+             resp = session.get(url, params=params)
+             resp.raise_for_status()
+             data = resp.json()
+             items = data.get("items") or []
+
+             for item in items:
+                 item_created = ensure_pendulum_datetime(item["created_at"])
+                 if item_created <= start_dt:
+                     continue
+                 if item_created > end_dt:
+                     continue
+                 item["created_at"] = item_created
+                 yield item
+
+             bookmark = data.get("bookmark")
+             if not bookmark:
+                 break
+
+     @dlt.resource(write_disposition="merge", primary_key="id")
+     def pins(
+         datetime=dlt.sources.incremental(
+             "created_at",
+             initial_value=start_date,
+             end_value=end_date,
+         ),
+     ) -> Iterable[TDataItem]:
+         _start_date = datetime.last_value or start_date
+         if end_date is None:
+             _end_date = pendulum.now("UTC")
+         else:
+             _end_date = datetime.end_value
+         yield from fetch_data("pins", _start_date, _end_date)
+
+     @dlt.resource(write_disposition="merge", primary_key="id")
+     def boards(
+         datetime=dlt.sources.incremental(
+             "created_at",
+             initial_value=start_date,
+             end_value=end_date,
+         ),
+     ) -> Iterable[TDataItem]:
+         _start_date = datetime.last_value or start_date
+         if end_date is None:
+             _end_date = pendulum.now("UTC")
+         else:
+             _end_date = datetime.end_value
+         yield from fetch_data("boards", _start_date, _end_date)
+
+     return pins, boards
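A minimal pipeline sketch for the new Pinterest source; the token and destination are illustrative:

    import dlt
    import pendulum

    from ingestr.src.pinterest import pinterest_source

    pipeline = dlt.pipeline(
        pipeline_name="pinterest",
        destination="duckdb",
        dataset_name="pinterest_raw",
    )
    source = pinterest_source(
        start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
        access_token="...",  # hypothetical token
    )
    # pins and boards are filtered client-side to (start_date, end_date]
    # on created_at and merged on "id".
    pipeline.run(source)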