ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- ingestr/main.py +22 -3
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +169 -1
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +26 -23
- ingestr/src/facebook_ads/helpers.py +47 -1
- ingestr/src/factory.py +48 -0
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +9 -0
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -163
- ingestr/src/frankfurter/helpers.py +3 -3
- ingestr/src/freshdesk/__init__.py +25 -8
- ingestr/src/freshdesk/freshdesk_client.py +40 -5
- ingestr/src/fundraiseup/__init__.py +49 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +6 -4
- ingestr/src/google_analytics/__init__.py +1 -1
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/hubspot/__init__.py +6 -12
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/klaviyo/__init__.py +5 -5
- ingestr/src/linear/__init__.py +553 -116
- ingestr/src/linear/helpers.py +77 -38
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +5 -2
- ingestr/src/mongodb/helpers.py +384 -10
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -1
- ingestr/src/smartsheets/__init__.py +33 -5
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/sources.py +1418 -54
- ingestr/src/stripe_analytics/__init__.py +2 -19
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/tests/unit/test_smartsheets.py +6 -9
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
from typing import Callable, Iterable, Optional
|
|
2
|
+
|
|
3
|
+
import pendulum
|
|
4
|
+
from dlt.sources.helpers.requests import Client
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HostawayClient:
    """Minimal client for the Hostaway v1 REST API.

    Wraps a dlt requests session with bearer-token auth and exposes one
    ``fetch_*`` generator per endpoint. List endpoints either return a
    single page (``_fetch_single``) or are paged via limit/offset
    (``_paginate``). The ``fetch_all_*`` helpers fan out over listings or
    reservations and deliberately skip children that fail to load
    (best-effort extraction).
    """

    BASE_URL = "https://api.hostaway.com"

    def __init__(self, api_key: str) -> None:
        # raise_for_status=False here: each request calls raise_for_status()
        # itself so retry middleware does not abort the whole session.
        self.session = Client(raise_for_status=False).session
        self.session.headers.update({"Authorization": f"Bearer {api_key}"})

    @staticmethod
    def _extract_items(response_data):
        """Unwrap a Hostaway response payload.

        Responses are usually ``{"status": ..., "result": <items>}``; some
        endpoints return a bare list. Any other shape yields no items.
        """
        if isinstance(response_data, dict) and "result" in response_data:
            return response_data["result"]
        if isinstance(response_data, list):
            return response_data
        return []

    @staticmethod
    def _parse_datetime_or_epoch(value):
        """Parse *value* with pendulum, falling back to the Unix epoch.

        Normalizes Hostaway timestamp strings so records can be compared
        against the incremental window; missing or unparseable values sort
        before any realistic window start.
        """
        if value:
            try:
                return pendulum.parse(value)
            except Exception:
                pass
        return pendulum.datetime(1970, 1, 1, tz="UTC")

    def _fetch_single(self, url: str, params: Optional[dict] = None) -> Iterable[dict]:
        """GET *url* once and yield each item of the unwrapped payload."""
        response = self.session.get(url, params=params, timeout=30)
        response.raise_for_status()
        items = self._extract_items(response.json())

        if isinstance(items, list):
            for item in items:
                yield item
        elif isinstance(items, dict):
            yield items

    def _paginate(
        self,
        url: str,
        params: Optional[dict] = None,
        limit: int = 100,
        process_item: Optional[Callable[[dict], dict]] = None,
    ) -> Iterable[dict]:
        """Iterate *url* with limit/offset paging, yielding every item.

        Stops when a page is empty, shorter than *limit*, or the payload is
        a single dict (i.e. the endpoint is not actually paged).
        ``process_item`` is applied to each item before it is yielded.
        """
        offset = 0
        if params is None:
            params = {}

        while True:
            page_params = {**params, "limit": limit, "offset": offset}
            response = self.session.get(url, params=page_params, timeout=30)
            response.raise_for_status()
            items = self._extract_items(response.json())

            if not items:
                break

            if isinstance(items, list):
                for item in items:
                    if process_item:
                        item = process_item(item)
                    yield item
            elif isinstance(items, dict):
                if process_item:
                    items = process_item(items)
                yield items

            # A short page is the last one; a dict payload is not paged.
            if isinstance(items, list) and len(items) < limit:
                break
            elif isinstance(items, dict):
                break

            offset += limit

    def fetch_listings(
        self,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield listings whose ``latestActivityOn`` falls inside the window."""

        def process_listing(listing: dict) -> dict:
            listing["latestActivityOn"] = self._parse_datetime_or_epoch(
                listing.get("latestActivityOn")
            )
            return listing

        url = f"{self.BASE_URL}/v1/listings"
        for listing in self._paginate(url, process_item=process_listing):
            if start_time <= listing["latestActivityOn"] <= end_time:
                yield listing

    def fetch_listing_fee_settings(
        self,
        listing_id,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield fee settings for one listing whose ``updatedOn`` is in the window."""

        def process_fee(fee: dict) -> dict:
            fee["updatedOn"] = self._parse_datetime_or_epoch(fee.get("updatedOn"))
            return fee

        url = f"{self.BASE_URL}/v1/listingFeeSettings/{str(listing_id)}"
        for fee in self._paginate(url, process_item=process_fee):
            if start_time <= fee["updatedOn"] <= end_time:
                yield fee

    def fetch_all_listing_fee_settings(
        self,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield fee settings for every listing in the window (best effort)."""
        for listing in self.fetch_listings(start_time, end_time):
            listing_id = listing.get("id")
            if listing_id:
                try:
                    yield from self.fetch_listing_fee_settings(
                        listing_id, start_time, end_time
                    )
                except Exception:
                    # Best-effort: skip listings whose fee settings fail to load.
                    continue

    def fetch_listing_agreement(
        self,
        listing_id,
    ) -> Iterable[dict]:
        """Yield the rental agreement records for one listing."""
        url = f"{self.BASE_URL}/v1/listingAgreement/{str(listing_id)}"
        yield from self._paginate(url)

    def fetch_listing_pricing_settings(
        self,
        listing_id,
    ) -> Iterable[dict]:
        """Yield the pricing settings for one listing."""
        url = f"{self.BASE_URL}/v1/listing/pricingSettings/{str(listing_id)}"
        yield from self._paginate(url)

    def fetch_all_listing_pricing_settings(
        self,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield pricing settings for every listing in the window (best effort)."""
        for listing in self.fetch_listings(start_time, end_time):
            listing_id = listing.get("id")
            if listing_id:
                try:
                    yield from self.fetch_listing_pricing_settings(listing_id)
                except Exception:
                    continue

    def fetch_all_listing_agreements(
        self,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield agreements for every listing in the window (best effort)."""
        for listing in self.fetch_listings(start_time, end_time):
            listing_id = listing.get("id")
            if listing_id:
                try:
                    yield from self.fetch_listing_agreement(listing_id)
                except Exception:
                    continue

    def fetch_cancellation_policies(self) -> Iterable[dict]:
        """Yield all cancellation policies."""
        url = f"{self.BASE_URL}/v1/cancellationPolicies"
        yield from self._fetch_single(url)

    def fetch_cancellation_policies_airbnb(self) -> Iterable[dict]:
        """Yield Airbnb-specific cancellation policies."""
        url = f"{self.BASE_URL}/v1/cancellationPolicies/airbnb"
        yield from self._fetch_single(url)

    def fetch_cancellation_policies_marriott(self) -> Iterable[dict]:
        """Yield Marriott-specific cancellation policies."""
        url = f"{self.BASE_URL}/v1/cancellationPolicies/marriott"
        yield from self._fetch_single(url)

    def fetch_cancellation_policies_vrbo(self) -> Iterable[dict]:
        """Yield Vrbo-specific cancellation policies."""
        url = f"{self.BASE_URL}/v1/cancellationPolicies/vrbo"
        yield from self._fetch_single(url)

    def fetch_reservations(self) -> Iterable[dict]:
        """Yield all reservations (paged)."""
        url = f"{self.BASE_URL}/v1/reservations"
        yield from self._paginate(url)

    def fetch_finance_field(self, reservation_id) -> Iterable[dict]:
        """Yield the finance fields for one reservation."""
        url = f"{self.BASE_URL}/v1/financeField/{str(reservation_id)}"
        yield from self._fetch_single(url)

    def fetch_all_finance_fields(self) -> Iterable[dict]:
        """Yield finance fields for every reservation (best effort)."""
        for reservation in self.fetch_reservations():
            reservation_id = reservation.get("id")
            if reservation_id:
                try:
                    yield from self.fetch_finance_field(reservation_id)
                except Exception:
                    continue

    def fetch_reservation_payment_methods(self) -> Iterable[dict]:
        """Yield the available reservation payment methods."""
        url = f"{self.BASE_URL}/v1/reservations/paymentMethods"
        yield from self._fetch_single(url)

    def fetch_reservation_rental_agreement(self, reservation_id) -> Iterable[dict]:
        """Yield the rental agreement for one reservation; empty on any error."""
        url = f"{self.BASE_URL}/v1/reservations/{str(reservation_id)}/rentalAgreement"
        try:
            yield from self._fetch_single(url)
        except Exception:
            # Not every reservation has an agreement; treat failures as "none".
            return

    def fetch_all_reservation_rental_agreements(self) -> Iterable[dict]:
        """Yield rental agreements for every reservation (best effort)."""
        for reservation in self.fetch_reservations():
            reservation_id = reservation.get("id")
            if reservation_id:
                try:
                    yield from self.fetch_reservation_rental_agreement(reservation_id)
                except Exception:
                    continue

    def fetch_listing_calendar(self, listing_id) -> Iterable[dict]:
        """Yield the calendar entries for one listing."""
        url = f"{self.BASE_URL}/v1/listings/{str(listing_id)}/calendar"
        yield from self._fetch_single(url)

    def fetch_all_listing_calendars(
        self,
        start_time: pendulum.DateTime,
        end_time: pendulum.DateTime,
    ) -> Iterable[dict]:
        """Yield calendars for every listing in the window (best effort)."""
        for listing in self.fetch_listings(start_time, end_time):
            listing_id = listing.get("id")
            if listing_id:
                try:
                    yield from self.fetch_listing_calendar(listing_id)
                except Exception:
                    continue

    def fetch_conversations(self) -> Iterable[dict]:
        """Yield all conversations (paged)."""
        url = f"{self.BASE_URL}/v1/conversations"
        yield from self._paginate(url)

    def fetch_message_templates(self) -> Iterable[dict]:
        """Yield all message templates."""
        url = f"{self.BASE_URL}/v1/messageTemplates"
        yield from self._fetch_single(url)

    def fetch_bed_types(self) -> Iterable[dict]:
        """Yield the bed-type reference data."""
        url = f"{self.BASE_URL}/v1/bedTypes"
        yield from self._fetch_single(url)

    def fetch_property_types(self) -> Iterable[dict]:
        """Yield the property-type reference data."""
        url = f"{self.BASE_URL}/v1/propertyTypes"
        yield from self._fetch_single(url)

    def fetch_countries(self) -> Iterable[dict]:
        """Yield the country reference data."""
        url = f"{self.BASE_URL}/v1/countries"
        yield from self._fetch_single(url)

    def fetch_account_tax_settings(self) -> Iterable[dict]:
        """Yield the account-level tax settings."""
        url = f"{self.BASE_URL}/v1/accountTaxSettings"
        yield from self._fetch_single(url)

    def fetch_user_groups(self) -> Iterable[dict]:
        """Yield the configured user groups."""
        url = f"{self.BASE_URL}/v1/userGroups"
        yield from self._fetch_single(url)

    def fetch_guest_payment_charges(self) -> Iterable[dict]:
        """Yield guest payment charges (paged)."""
        url = f"{self.BASE_URL}/v1/guestPayments/charges"
        yield from self._paginate(url)

    def fetch_coupons(self) -> Iterable[dict]:
        """Yield all coupons."""
        url = f"{self.BASE_URL}/v1/coupons"
        yield from self._fetch_single(url)

    def fetch_webhook_reservations(self) -> Iterable[dict]:
        """Yield the configured reservation webhooks."""
        url = f"{self.BASE_URL}/v1/webhooks/reservations"
        yield from self._fetch_single(url)

    def fetch_tasks(self) -> Iterable[dict]:
        """Yield all tasks."""
        url = f"{self.BASE_URL}/v1/tasks"
        yield from self._fetch_single(url)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""HTTP source for reading CSV, JSON, and Parquet files from public URLs"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
import dlt
|
|
6
|
+
from dlt.sources import DltResource
|
|
7
|
+
|
|
8
|
+
from .readers import HttpReader
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dlt.source
def http_source(
    url: str,
    file_format: Optional[str] = None,
    **kwargs: Any,
) -> DltResource:
    """Source for reading files from HTTP URLs.

    Supports CSV, JSON, and Parquet file formats.

    Args:
        url (str): The HTTP(S) URL to the file.
        file_format (str, optional): File format ('csv', 'json', 'parquet').
            When omitted, it is inferred from the URL extension.
        **kwargs: Extra arguments forwarded to the underlying reader.

    Returns:
        DltResource: A dlt resource that yields the file data.
    """
    file_reader = HttpReader(url, file_format)
    data_iterator = file_reader.read_file(**kwargs)
    return dlt.resource(data_iterator, name="http_data")
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Readers for HTTP file sources"""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
from typing import Any, Iterator, Optional
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from dlt.sources import TDataItems
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HttpReader:
    """Reader for HTTP-based file sources.

    Downloads a file from a public URL into memory and yields its records
    in chunks. Supported formats: csv, json (documents and JSONL), parquet.
    """

    def __init__(self, url: str, file_format: Optional[str] = None):
        """Create a reader for *url*, inferring the format when not given.

        Raises:
            ValueError: if the format is neither given/inferable nor one of
                csv/json/parquet.
        """
        self.url = url
        self.file_format = file_format or self._infer_format(url)

        if self.file_format not in ["csv", "json", "parquet"]:
            raise ValueError(
                f"Unsupported file format: {self.file_format}. "
                "Supported formats: csv, json, parquet"
            )

    def _infer_format(self, url: str) -> str:
        """Infer file format from the URL path's extension (case-insensitive)."""
        parsed = urlparse(url)
        path = parsed.path.lower()

        if path.endswith(".csv"):
            return "csv"
        if path.endswith((".json", ".jsonl")):
            return "json"
        if path.endswith(".parquet"):
            return "parquet"
        raise ValueError(
            f"Cannot infer file format from URL: {url}. "
            "Please specify file_format parameter."
        )

    def _download_file(self) -> bytes:
        """Download the file and return its entire content as bytes."""
        response = requests.get(self.url, stream=True, timeout=30)
        response.raise_for_status()
        # NOTE: .content reads the whole body into memory regardless of
        # stream=True; chunked parsing below only bounds downstream memory.
        return response.content

    def read_file(self, **kwargs: Any) -> Iterator[TDataItems]:
        """Download the file and yield its records chunk by chunk."""
        content = self._download_file()

        if self.file_format == "csv":
            yield from self._read_csv(content, **kwargs)
        elif self.file_format == "json":
            yield from self._read_json(content, **kwargs)
        elif self.file_format == "parquet":
            yield from self._read_parquet(content, **kwargs)

    def _read_csv(
        self, content: bytes, chunksize: int = 10000, **pandas_kwargs: Any
    ) -> Iterator[TDataItems]:
        """Read CSV content with pandas, yielding lists of row dicts."""
        import pandas as pd  # type: ignore

        kwargs = {"header": "infer", "chunksize": chunksize, **pandas_kwargs}

        for df in pd.read_csv(io.BytesIO(content), **kwargs):
            yield df.to_dict(orient="records")

    def _read_json(
        self, content: bytes, chunksize: int = 1000, **kwargs: Any
    ) -> Iterator[TDataItems]:
        """Read JSON or JSONL content, yielding lists of records.

        A whole-document parse is attempted first so that pretty-printed
        (multi-line) JSON is not mistaken for JSONL; only when that parse
        fails is the content treated as one JSON object per line. (The
        previous line-count heuristic crashed on indented JSON documents.)
        """
        from dlt.common import json

        text = content.decode("utf-8")

        parsed_whole = True
        try:
            data = json.loads(text)
        except Exception:
            parsed_whole = False

        if not parsed_whole:
            # JSONL: one JSON value per non-empty line, yielded in chunks.
            lines_chunk = []
            for line in text.strip().split("\n"):
                if line.strip():
                    lines_chunk.append(json.loads(line))
                    if len(lines_chunk) >= chunksize:
                        yield lines_chunk
                        lines_chunk = []
            if lines_chunk:
                yield lines_chunk
        elif isinstance(data, list):
            # Top-level array: chunk it.
            for i in range(0, len(data), chunksize):
                yield data[i : i + chunksize]
        else:
            # Single JSON value.
            yield [data]

    def _read_parquet(
        self, content: bytes, chunksize: int = 10000, **kwargs: Any
    ) -> Iterator[TDataItems]:
        """Read Parquet content with pyarrow, yielding lists of row dicts."""
        from pyarrow import parquet as pq  # type: ignore

        parquet_file = pq.ParquetFile(io.BytesIO(content))

        for batch in parquet_file.iter_batches(batch_size=chunksize):
            yield batch.to_pylist()
|
ingestr/src/hubspot/__init__.py
CHANGED
|
@@ -93,7 +93,6 @@ def hubspot(
|
|
|
93
93
|
def companies(
|
|
94
94
|
api_key: str = api_key,
|
|
95
95
|
include_history: bool = include_history,
|
|
96
|
-
props: Sequence[str] = DEFAULT_COMPANY_PROPS,
|
|
97
96
|
include_custom_props: bool = include_custom_props,
|
|
98
97
|
) -> Iterator[TDataItems]:
|
|
99
98
|
"""Hubspot companies resource"""
|
|
@@ -101,7 +100,7 @@ def hubspot(
|
|
|
101
100
|
"company",
|
|
102
101
|
api_key,
|
|
103
102
|
include_history=include_history,
|
|
104
|
-
props=
|
|
103
|
+
props=DEFAULT_COMPANY_PROPS,
|
|
105
104
|
include_custom_props=include_custom_props,
|
|
106
105
|
)
|
|
107
106
|
|
|
@@ -109,7 +108,6 @@ def hubspot(
|
|
|
109
108
|
def contacts(
|
|
110
109
|
api_key: str = api_key,
|
|
111
110
|
include_history: bool = include_history,
|
|
112
|
-
props: Sequence[str] = DEFAULT_CONTACT_PROPS,
|
|
113
111
|
include_custom_props: bool = include_custom_props,
|
|
114
112
|
) -> Iterator[TDataItems]:
|
|
115
113
|
"""Hubspot contacts resource"""
|
|
@@ -117,7 +115,7 @@ def hubspot(
|
|
|
117
115
|
"contact",
|
|
118
116
|
api_key,
|
|
119
117
|
include_history,
|
|
120
|
-
|
|
118
|
+
DEFAULT_CONTACT_PROPS,
|
|
121
119
|
include_custom_props,
|
|
122
120
|
)
|
|
123
121
|
|
|
@@ -125,7 +123,6 @@ def hubspot(
|
|
|
125
123
|
def deals(
|
|
126
124
|
api_key: str = api_key,
|
|
127
125
|
include_history: bool = include_history,
|
|
128
|
-
props: Sequence[str] = DEFAULT_DEAL_PROPS,
|
|
129
126
|
include_custom_props: bool = include_custom_props,
|
|
130
127
|
) -> Iterator[TDataItems]:
|
|
131
128
|
"""Hubspot deals resource"""
|
|
@@ -133,7 +130,7 @@ def hubspot(
|
|
|
133
130
|
"deal",
|
|
134
131
|
api_key,
|
|
135
132
|
include_history,
|
|
136
|
-
|
|
133
|
+
DEFAULT_DEAL_PROPS,
|
|
137
134
|
include_custom_props,
|
|
138
135
|
)
|
|
139
136
|
|
|
@@ -141,7 +138,6 @@ def hubspot(
|
|
|
141
138
|
def tickets(
|
|
142
139
|
api_key: str = api_key,
|
|
143
140
|
include_history: bool = include_history,
|
|
144
|
-
props: Sequence[str] = DEFAULT_TICKET_PROPS,
|
|
145
141
|
include_custom_props: bool = include_custom_props,
|
|
146
142
|
) -> Iterator[TDataItems]:
|
|
147
143
|
"""Hubspot tickets resource"""
|
|
@@ -149,7 +145,7 @@ def hubspot(
|
|
|
149
145
|
"ticket",
|
|
150
146
|
api_key,
|
|
151
147
|
include_history,
|
|
152
|
-
|
|
148
|
+
DEFAULT_TICKET_PROPS,
|
|
153
149
|
include_custom_props,
|
|
154
150
|
)
|
|
155
151
|
|
|
@@ -157,7 +153,6 @@ def hubspot(
|
|
|
157
153
|
def products(
|
|
158
154
|
api_key: str = api_key,
|
|
159
155
|
include_history: bool = include_history,
|
|
160
|
-
props: Sequence[str] = DEFAULT_PRODUCT_PROPS,
|
|
161
156
|
include_custom_props: bool = include_custom_props,
|
|
162
157
|
) -> Iterator[TDataItems]:
|
|
163
158
|
"""Hubspot products resource"""
|
|
@@ -165,7 +160,7 @@ def hubspot(
|
|
|
165
160
|
"product",
|
|
166
161
|
api_key,
|
|
167
162
|
include_history,
|
|
168
|
-
|
|
163
|
+
DEFAULT_PRODUCT_PROPS,
|
|
169
164
|
include_custom_props,
|
|
170
165
|
)
|
|
171
166
|
|
|
@@ -180,7 +175,6 @@ def hubspot(
|
|
|
180
175
|
def quotes(
|
|
181
176
|
api_key: str = api_key,
|
|
182
177
|
include_history: bool = include_history,
|
|
183
|
-
props: Sequence[str] = DEFAULT_QUOTE_PROPS,
|
|
184
178
|
include_custom_props: bool = include_custom_props,
|
|
185
179
|
) -> Iterator[TDataItems]:
|
|
186
180
|
"""Hubspot quotes resource"""
|
|
@@ -188,7 +182,7 @@ def hubspot(
|
|
|
188
182
|
"quote",
|
|
189
183
|
api_key,
|
|
190
184
|
include_history,
|
|
191
|
-
|
|
185
|
+
DEFAULT_QUOTE_PROPS,
|
|
192
186
|
include_custom_props,
|
|
193
187
|
)
|
|
194
188
|
|
ingestr/src/intercom/__init__.py
CHANGED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Intercom source implementation for data ingestion.
|
|
3
|
+
|
|
4
|
+
This module provides DLT sources for retrieving data from Intercom API endpoints
|
|
5
|
+
including contacts, companies, conversations, tickets, and more.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Optional, Sequence
|
|
9
|
+
|
|
10
|
+
import dlt
|
|
11
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
12
|
+
from dlt.common.typing import TAnyDateTime
|
|
13
|
+
from dlt.sources import DltResource, DltSource
|
|
14
|
+
|
|
15
|
+
from .helpers import (
|
|
16
|
+
IntercomAPIClient,
|
|
17
|
+
IntercomCredentialsAccessToken,
|
|
18
|
+
TIntercomCredentials,
|
|
19
|
+
convert_datetime_to_timestamp,
|
|
20
|
+
create_resource_from_config,
|
|
21
|
+
transform_company,
|
|
22
|
+
transform_contact,
|
|
23
|
+
transform_conversation,
|
|
24
|
+
)
|
|
25
|
+
from .helpers import (
|
|
26
|
+
IntercomCredentialsOAuth as IntercomCredentialsOAuth,
|
|
27
|
+
)
|
|
28
|
+
from .settings import (
|
|
29
|
+
DEFAULT_START_DATE,
|
|
30
|
+
RESOURCE_CONFIGS,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dlt.source(name="intercom", max_table_nesting=0)
def intercom_source(
    credentials: TIntercomCredentials = dlt.secrets.value,
    start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE,
    end_date: Optional[TAnyDateTime] = None,
) -> Sequence[DltResource]:
    """
    A DLT source that retrieves data from Intercom API.

    This source provides access to various Intercom resources including
    contacts, companies, conversations, tickets, and more. It supports
    incremental loading for resources that track updated timestamps.

    Args:
        credentials: Intercom API credentials (AccessToken or OAuth).
            Defaults to dlt.secrets.value.
        start_date: The start date for incremental loading.
            Defaults to January 1, 2020; also used when None is passed.
        end_date: Optional end date for incremental loading.
            If not provided, loads all data from start_date to present.

    Returns:
        Sequence of DLT resources for different Intercom endpoints.

    Example:
        >>> source = intercom_source(
        ...     credentials=IntercomCredentialsAccessToken(
        ...         access_token="your_token",
        ...         region="us"
        ...     ),
        ...     start_date=datetime(2024, 1, 1)
        ... )
    """
    api_client = IntercomAPIClient(credentials)

    # Intercom filters by unix timestamps; fall back to the module-level
    # default when start_date is explicitly passed as None.
    # (The redundant in-function re-import of DEFAULT_START_DATE was removed:
    # it is already imported at module scope.)
    start_date_obj = ensure_pendulum_datetime(start_date or DEFAULT_START_DATE)
    end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None

    start_timestamp = convert_datetime_to_timestamp(start_date_obj)
    end_timestamp = (
        convert_datetime_to_timestamp(end_date_obj) if end_date_obj else None
    )

    # Maps config keys to the record-transform callables applied per resource.
    transform_functions = {
        "transform_contact": transform_contact,
        "transform_company": transform_company,
        "transform_conversation": transform_conversation,
    }

    # Build one resource per configured endpoint.
    resources = []
    for resource_name, config in RESOURCE_CONFIGS.items():
        resource_func = create_resource_from_config(
            resource_name,
            config,
            api_client,
            start_timestamp,
            end_timestamp,
            transform_functions,
        )

        # Call the factory to materialize the actual dlt resource.
        resources.append(resource_func())

    return resources
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def intercom(
    api_key: str,
    region: str = "us",
    start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE,
    end_date: Optional[TAnyDateTime] = None,
) -> DltSource:
    """
    Convenience function to create Intercom source with access token.

    Args:
        api_key: Intercom API access token.
        region: Data region (us, eu, or au). Defaults to "us".
        start_date: Start date for incremental loading.
        end_date: Optional end date for incremental loading.

    Returns:
        Sequence of DLT resources.

    Example:
        >>> source = intercom(
        ...     api_key="your_access_token",
        ...     region="us",
        ...     start_date=datetime(2024, 1, 1)
        ... )
    """
    token_credentials = IntercomCredentialsAccessToken(
        access_token=api_key,
        region=region,
    )
    return intercom_source(
        credentials=token_credentials,
        start_date=start_date,
        end_date=end_date,
    )
|