ingestr 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +11 -1
- ingestr/src/.gitignore +10 -0
- ingestr/src/airtable/__init__.py +69 -0
- ingestr/src/facebook_ads/__init__.py +197 -0
- ingestr/src/facebook_ads/exceptions.py +5 -0
- ingestr/src/facebook_ads/helpers.py +255 -0
- ingestr/src/facebook_ads/settings.py +208 -0
- ingestr/src/factory.py +15 -0
- ingestr/src/kafka/__init__.py +103 -0
- ingestr/src/kafka/helpers.py +227 -0
- ingestr/src/klaviyo/_init_.py +173 -0
- ingestr/src/klaviyo/client.py +212 -0
- ingestr/src/klaviyo/helpers.py +19 -0
- ingestr/src/slack/__init__.py +272 -0
- ingestr/src/slack/helpers.py +204 -0
- ingestr/src/slack/settings.py +22 -0
- ingestr/src/sources.py +222 -1
- ingestr/src/version.py +1 -1
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/METADATA +31 -5
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/RECORD +23 -9
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/WHEEL +0 -0
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/entry_points.txt +0 -0
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/licenses/LICENSE.md +0 -0
--- /dev/null
+++ ingestr/src/klaviyo/_init_.py
@@ -0,0 +1,173 @@
+from typing import Iterable
+
+import dlt
+import pendulum
+import requests
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import TAnyDateTime, TDataItem
+from dlt.sources import DltResource
+from dlt.sources.helpers.requests import Client
+
+from ingestr.src.klaviyo.client import KlaviyoClient
+from ingestr.src.klaviyo.helpers import split_date_range
+
+
+def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
+    return response.status_code == 429
+
+
+def create_client() -> requests.Session:
+    return Client(
+        request_timeout=10.0,
+        raise_for_status=False,
+        retry_condition=retry_on_limit,
+        request_max_attempts=12,
+        request_backoff_factor=2,
+    ).session
+
+
+@dlt.source(max_table_nesting=0)
+def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResource]:
+    start_date_obj = ensure_pendulum_datetime(start_date)
+    client = KlaviyoClient(api_key)
+
+    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
+    def events(
+        datetime=dlt.sources.incremental("datetime", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(datetime.start_value), pendulum.now()
+        )
+
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_events(create_client(), s, e)
+
+    @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
+    def profiles(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated.start_value), pendulum.now()
+        )
+
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_profiles(create_client(), s, e)
+
+    @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
+    def campaigns(
+        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated_at.start_value), pendulum.now()
+        )
+
+        for campaign_type in ["email", "sms"]:
+            for start, end in intervals:
+                yield lambda s=start, e=end, ct=campaign_type: client.fetch_campaigns(
+                    create_client(), s, e, ct
+                )
+
+    @dlt.resource(write_disposition="merge", primary_key="id")
+    def metrics(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_metrics(create_client(), updated.start_value)
+
+    @dlt.resource(write_disposition="replace", primary_key="id")
+    def tags() -> Iterable[TAnyDateTime]:
+        yield from client.fetch_tag(create_client())
+
+    @dlt.resource(write_disposition="replace", primary_key="id")
+    def coupons() -> Iterable[TAnyDateTime]:
+        yield from client.fetch_coupons(create_client())
+
+    @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-variants")
+    def catalog_variants(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_catalog_variant(create_client(), updated.start_value)
+
+    @dlt.resource(
+        write_disposition="merge", primary_key="id", name="catalog-categories"
+    )
+    def catalog_categories(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_catalog_categories(create_client(), updated.start_value)
+
+    @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-items")
+    def catalog_items(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_catalog_item(create_client(), updated.start_value)
+
+    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
+    def forms(
+        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated_at.start_value), pendulum.now()
+        )
+
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_forms(create_client(), s, e)
+
+    @dlt.resource(write_disposition="merge", primary_key="id")
+    def lists(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_lists(create_client(), updated.start_value)
+
+    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
+    def images(
+        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated_at.start_value), pendulum.now()
+        )
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_images(create_client(), s, e)
+
+    @dlt.resource(write_disposition="merge", primary_key="id")
+    def segments(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        yield from client.fetch_segments(create_client(), updated.start_value)
+
+    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
+    def flows(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated.start_value), pendulum.now()
+        )
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_flows(create_client(), s, e)
+
+    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
+    def templates(
+        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
+    ) -> Iterable[TDataItem]:
+        intervals = split_date_range(
+            pendulum.parse(updated.start_value), pendulum.now()
+        )
+        for start, end in intervals:
+            yield lambda s=start, e=end: client.fetch_templates(create_client(), s, e)
+
+    return (
+        events,
+        profiles,
+        campaigns,
+        metrics,
+        tags,
+        coupons,
+        catalog_variants,
+        catalog_categories,
+        catalog_items,
+        forms,
+        lists,
+        images,
+        segments,
+        flows,
+        templates,
+    )
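For orientation, a minimal sketch of how this new Klaviyo source might be run through a dlt pipeline. The pipeline name, destination, dataset name, and API key are placeholders, and the import assumes the source is importable as a regular package (the file ships as `_init_.py` in this release, so the exact module path may need adjusting); ingestr itself normally wires this up through its factory and `sources.py` rather than by direct import.

    import dlt

    from ingestr.src.klaviyo import klaviyo_source  # hypothetical import path

    # Placeholder pipeline wiring; destination and names are illustrative only.
    pipeline = dlt.pipeline(
        pipeline_name="klaviyo_demo",
        destination="duckdb",
        dataset_name="klaviyo_raw",
    )

    source = klaviyo_source(api_key="pk_...", start_date="2024-01-01")
    # Load only a couple of the declared resources.
    info = pipeline.run(source.with_resources("events", "lists"))
    print(info)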
--- /dev/null
+++ ingestr/src/klaviyo/client.py
@@ -0,0 +1,212 @@
+from urllib.parse import urlencode
+
+import pendulum
+import requests
+
+BASE_URL = "https://a.klaviyo.com/api"
+
+
+class KlaviyoClient:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+
+    def __get_headers(self):
+        return {
+            "Authorization": f"Klaviyo-API-Key {self.api_key}",
+            "accept": "application/json",
+            "revision": "2024-07-15",
+        }
+
+    def _flatten_attributes(self, items: list):
+        for event in items:
+            if "attributes" not in event:
+                continue
+
+            for attribute_key in event["attributes"]:
+                event[attribute_key] = event["attributes"][attribute_key]
+
+            del event["attributes"]
+        return items
+
+    def _fetch_pages(
+        self, session: requests.Session, url: str, flat: bool = True
+    ) -> list:
+        all_items = []
+        while True:
+            response = session.get(url=url, headers=self.__get_headers())
+            result = response.json()
+            items = result.get("data", [])
+
+            if flat:
+                items = self._flatten_attributes(items)
+
+            all_items.extend(items)
+            nextURL = result.get("links", {}).get("next")
+            if nextURL is None:
+                break
+
+            url = nextURL
+
+        return all_items
+
+    def fetch_events(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+    ):
+        print(f"Fetching events for {start_date} to {end_date}")
+        url = f"{BASE_URL}/events/?sort=-datetime&filter=and(greater-or-equal(datetime,{start_date}),less-than(datetime,{end_date}))"
+        return self._fetch_pages(session, url)
+
+    def fetch_metrics(
+        self,
+        session: requests.Session,
+        last_updated: str,
+    ):
+        print(f"Fetching metrics since {last_updated}")
+        url = f"{BASE_URL}/metrics"
+        items = self._fetch_pages(session, url)
+
+        last_updated_obj = pendulum.parse(last_updated)
+        for item in items:
+            updated_at = pendulum.parse(item["updated"])
+            if updated_at > last_updated_obj:
+                yield item
+
+    def fetch_profiles(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+    ):
+        pendulum_start_date = pendulum.parse(start_date)
+        pendulum_start_date = pendulum_start_date.subtract(seconds=1)
+        url = f"{BASE_URL}/profiles/?sort=updated&filter=and(greater-than(updated,{pendulum_start_date.isoformat()}),less-than(updated,{end_date}))"
+        return self._fetch_pages(session, url)
+
+    def fetch_campaigns(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+        campaign_type: str,
+    ):
+        print(f"Fetching {campaign_type} campaigns for {start_date} to {end_date}")
+
+        base_url = f"{BASE_URL}/campaigns/"
+        params = {
+            "sort": "updated_at",
+            "filter": f"and(equals(messages.channel,'{campaign_type}'),greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))",
+        }
+        url = f"{base_url}?{urlencode(params)}"
+        pages = self._fetch_pages(session, url)
+        for page in pages:
+            page["campaign_type"] = campaign_type
+
+        return pages
+
+    def fetch_tag(self, session: requests.Session):
+        url = f"{BASE_URL}/tags"
+        return self._fetch_pages(session, url, False)
+
+    def fetch_catalog_variant(
+        self,
+        session: requests.Session,
+        last_updated: str,
+    ):
+        url = f"{BASE_URL}/catalog-variants"
+        items = self._fetch_pages(session, url)
+        last_updated_obj = pendulum.parse(last_updated)
+
+        for item in items:
+            updated_at = pendulum.parse(item["updated"])
+            if updated_at > last_updated_obj:
+                yield item
+
+    def fetch_coupons(self, session: requests.Session):
+        url = f"{BASE_URL}/coupons"
+        return self._fetch_pages(session, url, False)
+
+    def fetch_catalog_categories(
+        self,
+        session: requests.Session,
+        last_updated: str,
+    ):
+        url = f"{BASE_URL}/catalog-categories"
+        items = self._fetch_pages(session, url)
+        last_updated_obj = pendulum.parse(last_updated)
+
+        for item in items:
+            updated_at = pendulum.parse(item["updated"])
+            if updated_at > last_updated_obj:
+                yield item
+
+    def fetch_catalog_item(
+        self,
+        session: requests.Session,
+        last_updated: str,
+    ):
+        url = f"{BASE_URL}/catalog-items"
+        items = self._fetch_pages(session, url)
+        last_updated_obj = pendulum.parse(last_updated)
+
+        for item in items:
+            updated_at = pendulum.parse(item["updated"])
+            if updated_at > last_updated_obj:
+                yield item
+
+    def fetch_forms(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+    ):
+        print(f"Fetching forms for {start_date} to {end_date}")
+        url = f"{BASE_URL}/forms/?sort=-updated_at&filter=and(greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))"
+        return self._fetch_pages(session, url)
+
+    def fetch_lists(
+        self,
+        session: requests.Session,
+        updated_date: str,
+    ):
+        # https://a.klaviyo.com/api/lists/?sort=-updated&filter=greater-than(updated,2024-02-01 00:00:00+00:00)
+        url = f"{BASE_URL}/lists/?sort=-updated&filter=greater-than(updated,{updated_date})"
+        return self._fetch_pages(session, url)
+
+    def fetch_images(self, session: requests.Session, start_date: str, end_date: str):
+        # https://a.klaviyo.com/api/images/?sort=-updated_at&filter=greater-or-equal(updated_at,2024-06-01 00:00:00+00:00),less-than(updated_at,2024-09-01 00:00:00+00:00)
+        url = f"{BASE_URL}/images/?sort=-updated_at&filter=and(greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))"
+        return self._fetch_pages(session, url)
+
+    def fetch_segments(
+        self,
+        session: requests.Session,
+        updated_date: str,
+    ):
+        # https://a.klaviyo.com/api/segments/?sort=-updated&filter=greater-than(updated,2024-04-01 00:00:00+00:00)
+        url = f"{BASE_URL}/segments/?sort=-updated&filter=greater-than(updated,{updated_date})"
+        print("url", url)
+        return self._fetch_pages(session, url)
+
+    def fetch_flows(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+    ):
+        print(f"Fetching events for {start_date} to {end_date}")
+        # https://a.klaviyo.com/api/flows/?sort=-updated&filter=and(greater-or-equal(updated,2024-06-01 00:00:00+00:00),less-than(updated,2024-09-01 00:00:00+00:00))
+        url = f"{BASE_URL}/flows/?sort=-updated&filter=and(greater-or-equal(updated,{start_date}),less-than(updated,{end_date}))"
+        return self._fetch_pages(session, url)
+
+    def fetch_templates(
+        self,
+        session: requests.Session,
+        start_date: str,
+        end_date: str,
+    ):
+        # https://a.klaviyo.com/api/templates/?sort=-updated&filter=and(greater-or-equal(updated,2024-06-01 00:00:00+00:00),less-than(updated,2024-09-01 00:00:00+00:00))
+        url = f"{BASE_URL}/templates/?sort=-updated&filter=and(greater-or-equal(updated,{start_date}),less-than(updated,{end_date}))"
+        return self._fetch_pages(session, url)
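The client can also be exercised on its own, outside of dlt. A minimal sketch, assuming a valid Klaviyo private API key; the plain `requests.Session()` stands in for the retrying session that `create_client()` builds in the source module above.

    import requests

    from ingestr.src.klaviyo.client import KlaviyoClient

    client = KlaviyoClient(api_key="pk_...")  # placeholder key
    session = requests.Session()  # the source normally passes a retrying dlt Client session

    # fetch_lists follows the "links.next" pagination until it is exhausted
    # and returns the flattened items as a single list.
    lists = client.fetch_lists(session, "2024-01-01T00:00:00+00:00")
    print(f"fetched {len(lists)} lists")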
--- /dev/null
+++ ingestr/src/klaviyo/helpers.py
@@ -0,0 +1,19 @@
+from typing import List
+
+import pendulum
+
+
+def split_date_range(
+    start_date: pendulum.DateTime, end_date: pendulum.DateTime
+) -> List[tuple]:
+    interval = "days"
+    if (end_date - start_date).days <= 1:
+        interval = "hours"
+
+    intervals = []
+    current = start_date
+    while current < end_date:
+        next_date = min(current.add(**{interval: 1}), end_date)
+        intervals.append((current.isoformat(), next_date.isoformat()))
+        current = next_date
+    return intervals
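To make the chunking concrete, here is what `split_date_range` produces for a range spanning a couple of days (ranges of one day or less are split into hour-sized chunks instead):

    import pendulum

    from ingestr.src.klaviyo.helpers import split_date_range

    intervals = split_date_range(
        pendulum.datetime(2024, 6, 1), pendulum.datetime(2024, 6, 3, 12)
    )
    # [('2024-06-01T00:00:00+00:00', '2024-06-02T00:00:00+00:00'),
    #  ('2024-06-02T00:00:00+00:00', '2024-06-03T00:00:00+00:00'),
    #  ('2024-06-03T00:00:00+00:00', '2024-06-03T12:00:00+00:00')]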
--- /dev/null
+++ ingestr/src/slack/__init__.py
@@ -0,0 +1,272 @@
+"""Fetches Slack Conversations, History and logs."""
+
+from functools import partial
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple
+
+import dlt
+from dlt.common.typing import TAnyDateTime, TDataItem
+from dlt.sources import DltResource
+from pendulum import DateTime
+
+from .helpers import SlackAPI, ensure_dt_type
+from .settings import (
+    DEFAULT_DATETIME_FIELDS,
+    DEFAULT_START_DATE,
+    MAX_PAGE_SIZE,
+    MSG_DATETIME_FIELDS,
+)
+
+
+@dlt.source(name="slack", max_table_nesting=0)
+def slack_source(
+    page_size: int = MAX_PAGE_SIZE,
+    access_token: str = dlt.secrets.value,
+    start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE,
+    end_date: Optional[TAnyDateTime] = None,
+    selected_channels: Optional[List[str]] = dlt.config.value,
+    table_per_channel: bool = True,
+    replies: bool = False,
+) -> Iterable[DltResource]:
+    """
+    The source for the Slack pipeline. Available resources are conversations, conversations_history
+    and access_logs.
+
+    Args:
+        page_size: The max number of items to fetch per page. Defaults to 1000.
+        access_token: the oauth access_token used to authenticate.
+        start_date: The start time of the range for which to load. Defaults to January 1st 2000.
+        end_date: The end time of the range for which to load data.
+        selected_channels: The list of channels to load. If None, all channels will be loaded.
+        table_per_channel: Boolean flag, True by default. If True - for each channel separate table with messages is created.
+            Otherwise, all messages are put in one table.
+        replies: Boolean flag indicating if you want a replies table to be present as well. False by default.
+
+    Returns:
+        Iterable[DltResource]: A list of DltResource objects representing the data resources.
+    """
+
+    end_dt: Optional[DateTime] = ensure_dt_type(end_date)
+    start_dt: Optional[DateTime] = ensure_dt_type(start_date)
+    write_disposition: Literal["append", "merge"] = (
+        "append" if end_date is None else "merge"
+    )
+
+    api = SlackAPI(
+        access_token=access_token,
+        page_size=page_size,
+    )
+
+    def get_channels(
+        slack_api: SlackAPI, selected_channels: Optional[List[str]]
+    ) -> Tuple[List[TDataItem], List[TDataItem]]:
+        """
+        Returns channel fetched from slack and list of selected channels.
+
+        Args:
+            slack_api: Slack API instance.
+            selected_channels: List of selected channels names or None.
+
+        Returns:
+            Tuple[List[TDataItem], List[TDataItem]]: fetched channels and selected fetched channels.
+        """
+        channels: List[TDataItem] = []
+        for page_data in slack_api.get_pages(
+            resource="conversations.list",
+            response_path="$.channels[*]",
+            datetime_fields=DEFAULT_DATETIME_FIELDS,
+        ):
+            channels.extend(page_data)
+
+        if selected_channels:
+            fetch_channels = [
+                c
+                for c in channels
+                if c["name"] in selected_channels or c["id"] in selected_channels
+            ]
+        else:
+            fetch_channels = channels
+        return channels, fetch_channels
+
+    channels, fetched_selected_channels = get_channels(api, selected_channels)
+
+    @dlt.resource(name="channels", primary_key="id", write_disposition="replace")
+    def channels_resource() -> Iterable[TDataItem]:
+        """Yield all channels as a DLT resource."""
+        yield from channels
+
+    @dlt.resource(name="users", primary_key="id", write_disposition="replace")
+    def users_resource() -> Iterable[TDataItem]:
+        """
+        Yield all users as a DLT resource.
+
+        Yields:
+            Iterable[TDataItem]: A list of users.
+        """
+
+        for page_data in api.get_pages(
+            resource="users.list",
+            response_path="$.members[*]",
+            params=dict(include_locale=True),
+            datetime_fields=DEFAULT_DATETIME_FIELDS,
+        ):
+            yield page_data
+
+    def get_messages(
+        channel_data: Dict[str, Any], start_date_ts: float, end_date_ts: float
+    ) -> Iterable[TDataItem]:
+        """
+        Generator, which gets channel messages for specific dates.
+        Args:
+            channel_data: dict with channels data.
+            start_date_ts: start timestamp.
+            end_date_ts: end timestamp.
+
+        Yields:
+            List[TDataItem]: messages.
+        """
+        params = {
+            "channel": channel_data["id"],
+            "oldest": start_date_ts,
+            "latest": end_date_ts,
+        }
+
+        for page_data in api.get_pages(
+            resource="conversations.history",
+            response_path="$.messages[*]",
+            params=params,
+            datetime_fields=MSG_DATETIME_FIELDS,
+            context={"channel": channel_data["id"]},
+        ):
+            yield page_data
+
+    def get_thread_replies(messages: List[Dict[str, Any]]) -> Iterable[TDataItem]:
+        """
+        Generator, which gets replies for each message.
+        Args:
+            messages: messages data.
+
+        Yields:
+            Li
+        """
+        for message in messages:
+            if message.get("thread_ts", None):
+                params = {
+                    "channel": message["channel"],
+                    "ts": ensure_dt_type(message["thread_ts"], to_ts=True),
+                }
+
+                for page_data in api.get_pages(
+                    resource="conversations.replies",
+                    response_path="$.messages[*]",
+                    params=params,
+                    context={"channel": message["channel"]},
+                ):
+                    yield page_data[1:]
+
+    @dlt.resource(
+        name="messages",
+        primary_key=("channel", "ts"),
+        columns={"blocks": {"data_type": "complex"}},
+        write_disposition=write_disposition,
+    )
+    def messages_resource(
+        created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
+            "ts",
+            initial_value=start_dt,
+            end_value=end_dt,
+            allow_external_schedulers=True,
+        ),
+    ) -> Iterable[TDataItem]:
+        """
+        Yield all messages for a set of selected channels as a DLT resource. Keep blocks column without normalization.
+
+        Args:
+            created_at (dlt.sources.incremental[DateTime]): The incremental created_at field.
+
+        Yields:
+            Iterable[TDataItem]: A list of messages.
+        """
+        start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
+        end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
+        for channel_data in fetched_selected_channels:
+            yield from get_messages(channel_data, start_date_ts, end_date_ts)
+
+    def per_table_messages_resource(
+        channel_data: Dict[str, Any],
+        created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental(
+            "ts",
+            initial_value=start_dt,
+            end_value=end_dt,
+            allow_external_schedulers=True,
+        ),
+    ) -> Iterable[TDataItem]:
+        """Yield all messages for a given channel as a DLT resource. Keep blocks column without normalization.
+
+        Args:
+            channel_data (Dict[str, Any]): The channel data.
+            created_at (dlt.sources.incremental[DateTime]): The incremental created_at field.
+
+        Yields:
+            Iterable[TDataItem]: A list of messages.
+        """
+        start_date_ts = ensure_dt_type(created_at.last_value, to_ts=True)
+        end_date_ts = ensure_dt_type(created_at.end_value, to_ts=True)
+        yield from get_messages(channel_data, start_date_ts, end_date_ts)
+
+    def table_name_func(channel_name: str, payload: TDataItem) -> str:
+        """Return the table name for a given channel and payload."""
+        table_type = payload.get("subtype", payload.get("type", ""))
+        return f"{channel_name}_{table_type}"
+
+    # It will not work in the pipeline or tests because it is a paid feature,
+    # raise an error when it is not a paying account.
+    @dlt.resource(
+        name="access_logs",
+        selected=False,
+        primary_key="user_id",
+        write_disposition="append",
+    )
+    # it is not an incremental resource it just has an end_date filter
+    def logs_resource() -> Iterable[TDataItem]:
+        """The access logs resource."""
+        for page_data in api.get_pages(
+            resource="team.accessLogs",
+            response_path="$.logins[*]",
+            datetime_fields=["date_first", "date_last"],
+            params={"before": end_dt if end_dt is None else end_dt.int_timestamp},
+        ):
+            yield page_data
+
+    yield from (channels_resource, users_resource, logs_resource)
+
+    if table_per_channel:
+        for channel in fetched_selected_channels:
+            channel_name = channel["name"]
+            table_name = partial(table_name_func, channel_name)
+            messages_channel = dlt.resource(
+                per_table_messages_resource,
+                name=channel_name,
+                table_name=table_name,
+                primary_key=("channel", "ts"),
+                write_disposition=write_disposition,
+                columns={"blocks": {"data_type": "complex"}},
+            )(channel)
+
+            yield messages_channel
+            if replies:
+                yield messages_channel | dlt.transformer(
+                    get_thread_replies,
+                    name=channel_name + "_replies",
+                    table_name=partial(table_name_func, channel_name + "_replies"),
+                    primary_key=("thread_ts", "ts"),
+                    write_disposition=write_disposition,
+                )
+    else:
+        yield messages_resource
+        if replies:
+            yield messages_resource | dlt.transformer(
+                get_thread_replies,
+                name="replies",
+                primary_key=("thread_ts", "ts"),
+                write_disposition=write_disposition,
+            )
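As with the Klaviyo source, a minimal sketch of how this Slack source might be loaded with dlt; the token, channel name, and pipeline/destination settings are placeholders.

    import dlt

    from ingestr.src.slack import slack_source

    pipeline = dlt.pipeline(
        pipeline_name="slack_demo",   # placeholder
        destination="duckdb",         # placeholder
        dataset_name="slack_raw",
    )

    source = slack_source(
        access_token="xoxb-...",        # placeholder bot token
        start_date="2024-01-01",
        selected_channels=["general"],  # placeholder channel
        table_per_channel=False,        # single "messages" table instead of one per channel
        replies=True,                   # also load a "replies" table
    )
    print(pipeline.run(source))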