ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +10 -0
- ingestr/src/.gitignore +10 -0
- ingestr/src/adjust/__init__.py +31 -0
- ingestr/src/adjust/helpers.py +82 -0
- ingestr/src/appsflyer/__init__.py +24 -0
- ingestr/src/appsflyer/client.py +106 -0
- ingestr/src/facebook_ads/__init__.py +197 -0
- ingestr/src/facebook_ads/exceptions.py +5 -0
- ingestr/src/facebook_ads/helpers.py +255 -0
- ingestr/src/facebook_ads/settings.py +208 -0
- ingestr/src/factory.py +15 -0
- ingestr/src/kafka/__init__.py +103 -0
- ingestr/src/kafka/helpers.py +227 -0
- ingestr/src/klaviyo/__init__.py +173 -0
- ingestr/src/klaviyo/client.py +212 -0
- ingestr/src/klaviyo/helpers.py +19 -0
- ingestr/src/shopify/__init__.py +1752 -54
- ingestr/src/shopify/helpers.py +73 -32
- ingestr/src/sources.py +230 -7
- ingestr/src/version.py +1 -1
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/METADATA +22 -1
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/RECORD +25 -11
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/WHEEL +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/entry_points.txt +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/facebook_ads/helpers.py
ADDED (+255 lines)

```python
"""Facebook ads source helpers"""

import functools
import itertools
import time
from typing import Any, Iterator, Sequence

import dlt
import humanize
import pendulum
from dlt.common import logger
from dlt.common.configuration.inject import with_config
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import DictStrAny, TDataItem, TDataItems
from dlt.sources.helpers import requests
from dlt.sources.helpers.requests import Client
from facebook_business import FacebookAdsApi
from facebook_business.adobjects.abstractcrudobject import AbstractCrudObject
from facebook_business.adobjects.abstractobject import AbstractObject
from facebook_business.adobjects.adaccount import AdAccount
from facebook_business.adobjects.user import User
from facebook_business.api import FacebookResponse

from .exceptions import InsightsJobTimeout
from .settings import (
    FACEBOOK_INSIGHTS_RETENTION_PERIOD,
    INSIGHTS_PRIMARY_KEY,
    TFbMethod,
)


def get_start_date(
    incremental_start_date: dlt.sources.incremental[str],
    attribution_window_days_lag: int = 7,
) -> pendulum.DateTime:
    """
    Get the start date for incremental loading of Facebook Insights data.
    """
    start_date: pendulum.DateTime = ensure_pendulum_datetime(
        incremental_start_date.start_value
    ).subtract(days=attribution_window_days_lag)

    # facebook forgets insights so trim the lag and warn
    min_start_date = pendulum.today().subtract(
        months=FACEBOOK_INSIGHTS_RETENTION_PERIOD
    )
    if start_date < min_start_date:
        logger.warning(
            "%s: Start date is earlier than %s months ago, using %s instead. "
            "For more information, see https://www.facebook.com/business/help/1695754927158071?id=354406972049255",
            "facebook_insights",
            FACEBOOK_INSIGHTS_RETENTION_PERIOD,
            min_start_date,
        )
        start_date = min_start_date
        incremental_start_date.start_value = min_start_date

    # lag the incremental start date by attribution window lag
    incremental_start_date.start_value = start_date.isoformat()
    return start_date


def process_report_item(item: AbstractObject) -> DictStrAny:
    d: DictStrAny = item.export_all_data()
    for pki in INSIGHTS_PRIMARY_KEY:
        if pki not in d:
            d[pki] = "no_" + pki

    return d


def get_data_chunked(
    method: TFbMethod, fields: Sequence[str], states: Sequence[str], chunk_size: int
) -> Iterator[TDataItems]:
    # add pagination and chunk into lists
    params: DictStrAny = {"limit": chunk_size}
    if states:
        params.update({"effective_status": states})
    it: map[DictStrAny] = map(
        lambda c: c.export_all_data(), method(fields=fields, params=params)
    )
    while True:
        chunk = list(itertools.islice(it, chunk_size))
        if not chunk:
            break
        yield chunk


def enrich_ad_objects(fb_obj_type: AbstractObject, fields: Sequence[str]) -> Any:
    """Returns a transformation that will enrich any of the resources returned by `` with additional fields

    In example below we add "thumbnail_url" to all objects loaded by `ad_creatives` resource:
    >>> fb_ads = facebook_ads_source()
    >>> fb_ads.ad_creatives.add_step(enrich_ad_objects(AdCreative, ["thumbnail_url"]))

    Internally, the method uses batch API to get data efficiently. Refer to demo script for full examples

    Args:
        fb_obj_type (AbstractObject): A Facebook Business object type (Ad, Campaign, AdSet, AdCreative, Lead). Import those types from this module
        fields (Sequence[str]): A list/tuple of fields to add to each object.

    Returns:
        ItemTransformFunctionWithMeta[TDataItems]: A transformation function to be added to a resource with `add_step` method
    """

    def _wrap(items: TDataItems, meta: Any = None) -> TDataItems:
        api_batch = FacebookAdsApi.get_default_api().new_batch()

        def update_item(resp: FacebookResponse, item: TDataItem) -> None:
            item.update(resp.json())

        def fail(resp: FacebookResponse) -> None:
            raise resp.error()

        for item in items:
            o: AbstractCrudObject = fb_obj_type(item["id"])
            o.api_get(
                fields=fields,
                batch=api_batch,
                success=functools.partial(update_item, item=item),
                failure=fail,
            )
        api_batch.execute()
        return items

    return _wrap


JOB_TIMEOUT_INFO = """This is an intermittent error and may resolve itself on subsequent queries to the Facebook API.
You should remove the fields in `fields` argument that are not necessary, as that may help improve the reliability of the Facebook API."""


def execute_job(
    job: AbstractCrudObject,
    insights_max_wait_to_start_seconds: int = 5 * 60,
    insights_max_wait_to_finish_seconds: int = 30 * 60,
    insights_max_async_sleep_seconds: int = 5 * 60,
) -> AbstractCrudObject:
    status: str = None
    time_start = time.time()
    sleep_time = 10
    while status != "Job Completed":
        duration = time.time() - time_start
        job = job.api_get()
        status = job["async_status"]
        percent_complete = job["async_percent_completion"]

        job_id = job["id"]
        logger.info("%s, %d%% done", status, percent_complete)

        if status == "Job Completed":
            return job

        if duration > insights_max_wait_to_start_seconds and percent_complete == 0:
            pretty_error_message = (
                "Insights job {} did not start after {} seconds. " + JOB_TIMEOUT_INFO
            )
            raise InsightsJobTimeout(
                "facebook_insights",
                pretty_error_message.format(job_id, insights_max_wait_to_start_seconds),
            )
        elif (
            duration > insights_max_wait_to_finish_seconds and status != "Job Completed"
        ):
            pretty_error_message = (
                "Insights job {} did not complete after {} seconds. " + JOB_TIMEOUT_INFO
            )
            raise InsightsJobTimeout(
                "facebook_insights",
                pretty_error_message.format(
                    job_id, insights_max_wait_to_finish_seconds // 60
                ),
            )

        logger.info("sleeping for %d seconds until job is done", sleep_time)
        time.sleep(sleep_time)
        if sleep_time < insights_max_async_sleep_seconds:
            sleep_time = 2 * sleep_time
    return job


def get_ads_account(
    account_id: str, access_token: str, request_timeout: float, app_api_version: str
) -> AdAccount:
    notify_on_token_expiration()

    def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
        try:
            error = response.json()["error"]
            code = error["code"]
            message = error["message"]
            should_retry = code in (
                1,
                2,
                4,
                17,
                341,
                32,
                613,
                *range(80000, 80007),
                800008,
                800009,
                80014,
            )
            if should_retry:
                logger.warning(
                    "facebook_ads source will retry due to %s with error code %i"
                    % (message, code)
                )
            return should_retry
        except Exception:
            return False

    retry_session = Client(
        request_timeout=request_timeout,
        raise_for_status=False,
        retry_condition=retry_on_limit,
        request_max_attempts=12,
        request_backoff_factor=2,
    ).session
    retry_session.params.update({"access_token": access_token})  # type: ignore
    # patch dlt requests session with retries
    API = FacebookAdsApi.init(
        account_id="act_" + account_id,
        access_token=access_token,
        api_version=app_api_version,
    )
    API._session.requests = retry_session
    user = User(fbid="me")

    accounts = user.get_ad_accounts()
    account: AdAccount = None
    for acc in accounts:
        if acc["account_id"] == account_id:
            account = acc

    if not account:
        raise ValueError("Couldn't find account with id {}".format(account_id))

    return account


@with_config(sections=("sources", "facebook_ads"))
def notify_on_token_expiration(access_token_expires_at: int = None) -> None:
    """Notifies (currently via logger) if access token expires in less than 7 days. Needs `access_token_expires_at` to be configured."""
    if not access_token_expires_at:
        logger.warning(
            "Token expiration time notification disabled. Configure token expiration timestamp in access_token_expires_at config value"
        )
    else:
        expires_at = pendulum.from_timestamp(access_token_expires_at)
        if expires_at < pendulum.now().add(days=7):
            logger.error(
                f"Access Token expires in {humanize.precisedelta(pendulum.now() - expires_at)}. Replace the token now!"
            )
```
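
Taken together, these helpers wrap the async Insights flow: `get_ads_account` wires a retrying session into the SDK, and `execute_job` polls a report run with a doubling sleep interval (starting at 10s, capped by `insights_max_async_sleep_seconds`). A minimal sketch of how they might compose, assuming valid credentials; the account id, token, API version, and query parameters below are placeholders, not values taken from this package:

```python
from ingestr.src.facebook_ads.helpers import execute_job, get_ads_account

# hypothetical credentials; get_ads_account validates the account exists
account = get_ads_account(
    account_id="1234567890",
    access_token="EAAB...",
    request_timeout=300.0,
    app_api_version="v18.0",
)

# get_insights(is_async=True) returns an AdReportRun, which execute_job
# polls until async_status reaches "Job Completed" or a timeout raises
# InsightsJobTimeout
job = execute_job(account.get_insights(params={"level": "ad"}, is_async=True))
for row in job.get_result(params={"limit": 100}):
    print(row.export_all_data())
```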

ingestr/src/facebook_ads/settings.py
ADDED (+208 lines)

```python
"""Facebook ads source settings and constants"""

from typing import Any, Callable, Dict, Iterator, Literal

from dlt.common.schema.typing import TTableSchemaColumns
from facebook_business.adobjects.abstractobject import AbstractObject

TFbMethod = Callable[..., Iterator[AbstractObject]]


DEFAULT_FIELDS = (
    "id",
    "updated_time",
    "created_time",
    "name",
    "status",
    "effective_status",
)

DEFAULT_CAMPAIGN_FIELDS = DEFAULT_FIELDS + (
    "objective",
    "start_time",
    "stop_time",
    "daily_budget",
    "lifetime_budget",
)

DEFAULT_AD_FIELDS = DEFAULT_FIELDS + (
    "adset_id",
    "campaign_id",
    "creative",
    "targeting",
    "tracking_specs",
    "conversion_specs",
)

DEFAULT_ADSET_FIELDS = DEFAULT_FIELDS + (
    "campaign_id",
    "start_time",
    "end_time",
    "daily_budget",
    "lifetime_budget",
    "optimization_goal",
    "promoted_object",
    "billing_event",
    "bid_amount",
    "bid_strategy",
    "targeting",
)

DEFAULT_ADCREATIVE_FIELDS = (
    "id",
    "name",
    "status",
    "thumbnail_url",
    "object_story_spec",
    "effective_object_story_id",
    "call_to_action_type",
    "object_type",
    "template_url",
    "url_tags",
    "instagram_actor_id",
    "product_set_id",
)

DEFAULT_LEAD_FIELDS = (
    "id",
    "created_time",
    "ad_id",
    "ad_name",
    "adset_id",
    "adset_name",
    "campaign_id",
    "campaign_name",
    "form_id",
    "field_data",
)

DEFAULT_INSIGHT_FIELDS = (
    "campaign_id",
    "adset_id",
    "ad_id",
    "date_start",
    "date_stop",
    "reach",
    "impressions",
    "frequency",
    "clicks",
    "unique_clicks",
    "ctr",
    "unique_ctr",
    "cpc",
    "cpm",
    "cpp",
    "spend",
    "actions",
    "action_values",
    "cost_per_action_type",
    "website_ctr",
    "account_currency",
    "ad_click_actions",
    "ad_name",
    "adset_name",
    "campaign_name",
    "country",
    "dma",
    "full_view_impressions",
    "full_view_reach",
    "inline_link_click_ctr",
    "outbound_clicks",
    "reach",
    "social_spend",
    "spend",
    "website_ctr",
)

TInsightsLevels = Literal["account", "campaign", "adset", "ad"]

INSIGHTS_PRIMARY_KEY = ("campaign_id", "adset_id", "ad_id", "date_start")

ALL_STATES = {
    "effective_status": [
        "ACTIVE",
        "PAUSED",
        "DELETED",
        "PENDING_REVIEW",
        "DISAPPROVED",
        "PREAPPROVED",
        "PENDING_BILLING_INFO",
        "CAMPAIGN_PAUSED",
        "ARCHIVED",
        "ADSET_PAUSED",
    ]
}

TInsightsBreakdownOptions = Literal[
    "ads_insights",
    "ads_insights_age_and_gender",
    "ads_insights_country",
    "ads_insights_platform_and_device",
    "ads_insights_region",
    "ads_insights_dma",
    "ads_insights_hourly_advertiser",
]

ALL_ACTION_ATTRIBUTION_WINDOWS = (
    "1d_click",
    "7d_click",
    "28d_click",
    "1d_view",
    "7d_view",
    "28d_view",
)

ALL_ACTION_BREAKDOWNS = ("action_type", "action_target_id", "action_destination")

INSIGHTS_BREAKDOWNS_OPTIONS: Dict[TInsightsBreakdownOptions, Any] = {
    "ads_insights": {"breakdowns": (), "fields": ()},
    "ads_insights_age_and_gender": {
        "breakdowns": ("age", "gender"),
        "fields": ("age", "gender"),
    },
    "ads_insights_country": {"breakdowns": ("country",), "fields": ("country",)},
    "ads_insights_platform_and_device": {
        "breakdowns": ("publisher_platform", "platform_position", "impression_device"),
        "fields": ("publisher_platform", "platform_position", "impression_device"),
    },
    "ads_insights_region": {"breakdowns": ("region",), "fields": ("region",)},
    "ads_insights_dma": {"breakdowns": ("dma",), "fields": ("dma",)},
    "ads_insights_hourly_advertiser": {
        "breakdowns": ("hourly_stats_aggregated_by_advertiser_time_zone",),
        "fields": ("hourly_stats_aggregated_by_advertiser_time_zone",),
    },
}

INSIGHT_FIELDS_TYPES: TTableSchemaColumns = {
    "campaign_id": {"data_type": "bigint"},
    "adset_id": {"data_type": "bigint"},
    "ad_id": {"data_type": "bigint"},
    "date_start": {"data_type": "timestamp"},
    "date_stop": {"data_type": "timestamp"},
    "reach": {"data_type": "bigint"},
    "impressions": {"data_type": "bigint"},
    "frequency": {"data_type": "decimal"},
    "clicks": {"data_type": "bigint"},
    "unique_clicks": {"data_type": "bigint"},
    "ctr": {"data_type": "decimal"},
    "unique_ctr": {"data_type": "decimal"},
    "cpc": {"data_type": "decimal"},
    "cpm": {"data_type": "decimal"},
    "cpp": {"data_type": "decimal"},
    "spend": {"data_type": "decimal"},
}

INVALID_INSIGHTS_FIELDS = [
    "impression_device",
    "publisher_platform",
    "platform_position",
    "age",
    "gender",
    "country",
    "placement",
    "region",
    "dma",
    "hourly_stats_aggregated_by_advertiser_time_zone",
]

FACEBOOK_INSIGHTS_RETENTION_PERIOD = 37  # months
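```

These constants are consumed by the resource definitions in `facebook_ads/__init__.py` (its 197 added lines appear in the file list above but are not reproduced here). Note that `INVALID_INSIGHTS_FIELDS` lists exactly the breakdown columns that the Insights API rejects when requested as plain fields. A hedged sketch of how a breakdown option could translate into query arguments; the actual assembly lives in `__init__.py`:

```python
from ingestr.src.facebook_ads.settings import (
    DEFAULT_INSIGHT_FIELDS,
    INSIGHTS_BREAKDOWNS_OPTIONS,
    INSIGHTS_PRIMARY_KEY,
    INVALID_INSIGHTS_FIELDS,
)

option = INSIGHTS_BREAKDOWNS_OPTIONS["ads_insights_country"]

# breakdown columns may only be requested via the `breakdowns` parameter,
# so they are stripped from the plain field list; primary key columns stay
fields = sorted(
    set(DEFAULT_INSIGHT_FIELDS)
    .union(INSIGHTS_PRIMARY_KEY)
    .difference(INVALID_INSIGHTS_FIELDS)
)
query = {"fields": fields, "breakdowns": list(option["breakdowns"])}
```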
ingestr/src/factory.py
CHANGED

```diff
@@ -15,11 +15,16 @@ from ingestr.src.destinations import (
     SynapseDestination,
 )
 from ingestr.src.sources import (
+    AdjustSource,
     AirtableSource,
+    AppsflyerSource,
     ChessSource,
+    FacebookAdsSource,
     GoogleSheetsSource,
     GorgiasSource,
     HubspotSource,
+    KafkaSource,
+    KlaviyoSource,
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
@@ -111,12 +116,22 @@ class SourceDestinationFactory:
             return ChessSource()
         elif self.source_scheme == "stripe":
             return StripeAnalyticsSource()
+        elif self.source_scheme == "facebookads":
+            return FacebookAdsSource()
         elif self.source_scheme == "slack":
             return SlackSource()
         elif self.source_scheme == "hubspot":
             return HubspotSource()
         elif self.source_scheme == "airtable":
             return AirtableSource()
+        elif self.source_scheme == "klaviyo":
+            return KlaviyoSource()
+        elif self.source_scheme == "appsflyer":
+            return AppsflyerSource()
+        elif self.source_scheme == "kafka":
+            return KafkaSource()
+        elif self.source_scheme == "adjust":
+            return AdjustSource()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
```
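
An editorial aside on the dispatch above: as the scheme list grows, the elif chain could equivalently be a table lookup. A sketch only; the released wheel keeps the elif chain, and nothing below ships in the package:

```python
from ingestr.src.sources import (
    AdjustSource,
    AppsflyerSource,
    FacebookAdsSource,
    KafkaSource,
    KlaviyoSource,
)

# scheme -> source class, covering just the sources added in this release
SOURCES = {
    "facebookads": FacebookAdsSource,
    "klaviyo": KlaviyoSource,
    "appsflyer": AppsflyerSource,
    "kafka": KafkaSource,
    "adjust": AdjustSource,
}

def source_for_scheme(scheme: str):
    try:
        return SOURCES[scheme]()  # instantiate lazily, as the factory does
    except KeyError:
        raise ValueError(f"Unsupported source scheme: {scheme}")
```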
ingestr/src/kafka/__init__.py
ADDED (+103 lines)

```python
"""A source to extract Kafka messages.

When extraction starts, partitions length is checked -
data is read only up to it, overriding the default Kafka's
behavior of waiting for new messages in endless loop.
"""

from contextlib import closing
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import dlt
from confluent_kafka import Consumer, Message  # type: ignore
from dlt.common import logger
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TAnyDateTime, TDataItem

from .helpers import (
    KafkaCredentials,
    OffsetTracker,
    default_msg_processor,
)


@dlt.resource(
    name="kafka_messages",
    table_name=lambda msg: msg["_kafka"]["topic"],
    standalone=True,
)
def kafka_consumer(
    topics: Union[str, List[str]],
    credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
    msg_processor: Optional[
        Callable[[Message], Dict[str, Any]]
    ] = default_msg_processor,
    batch_size: Optional[int] = 3000,
    batch_timeout: Optional[int] = 3,
    start_from: Optional[TAnyDateTime] = None,
) -> Iterable[TDataItem]:
    """Extract recent messages from the given Kafka topics.

    The resource tracks offsets for all the topics and partitions,
    and so reads data incrementally.

    Messages from different topics are saved in different tables.

    Args:
        topics (Union[str, List[str]]): Names of topics to extract.
        credentials (Optional[Union[KafkaCredentials, Consumer]]):
            Auth credentials or an initiated Kafka consumer. By default,
            is taken from secrets.
        msg_processor(Optional[Callable]): A function-converter,
            which'll process every Kafka message after it's read and
            before it's transfered to the destination.
        batch_size (Optional[int]): Messages batch size to read at once.
        batch_timeout (Optional[int]): Maximum time to wait for a batch
            consume, in seconds.
        start_from (Optional[TAnyDateTime]): A timestamp, at which to start
            reading. Older messages are ignored.

    Yields:
        Iterable[TDataItem]: Kafka messages.
    """
    if not isinstance(topics, list):
        topics = [topics]

    if isinstance(credentials, Consumer):
        consumer = credentials
    elif isinstance(credentials, KafkaCredentials):
        consumer = credentials.init_consumer()
    else:
        raise TypeError(
            (
                "Wrong credentials type provided. Need to be of type: "
                "KafkaCredentials or confluent_kafka.Consumer"
            )
        )

    if start_from is not None:
        start_from = ensure_pendulum_datetime(start_from)

    tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)  # type: ignore

    # read messages up to the maximum offsets,
    # not waiting for new messages
    with closing(consumer):
        while tracker.has_unread:
            messages = consumer.consume(batch_size, timeout=batch_timeout)
            if not messages:
                break

            batch = []
            for msg in messages:
                if msg.error():
                    err = msg.error()
                    if err.retriable() or not err.fatal():
                        logger.warning(f"ERROR: {err} - RETRYING")
                    else:
                        raise err
                else:
                    batch.append(msg_processor(msg))  # type: ignore
                    tracker.renew(msg)

            yield batch
```
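
A minimal sketch of running this resource in a dlt pipeline, assuming Kafka credentials (bootstrap servers, auth) are configured in dlt's secrets; the topic, pipeline, destination, and start date below are placeholders, not values from this package:

```python
import dlt
from ingestr.src.kafka import kafka_consumer

pipeline = dlt.pipeline(
    pipeline_name="kafka_demo",
    destination="duckdb",
    dataset_name="kafka_data",
)

# offsets are persisted in resource state via OffsetTracker, so a second
# run only picks up messages appended since the previous run
messages = kafka_consumer("events", batch_size=1000, start_from="2024-01-01")
info = pipeline.run(messages)
print(info)
```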