ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestr/src/facebook_ads/helpers.py ADDED
@@ -0,0 +1,255 @@
+ """Facebook ads source helpers"""
+
+ import functools
+ import itertools
+ import time
+ from typing import Any, Iterator, Sequence
+
+ import dlt
+ import humanize
+ import pendulum
+ from dlt.common import logger
+ from dlt.common.configuration.inject import with_config
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import DictStrAny, TDataItem, TDataItems
+ from dlt.sources.helpers import requests
+ from dlt.sources.helpers.requests import Client
+ from facebook_business import FacebookAdsApi
+ from facebook_business.adobjects.abstractcrudobject import AbstractCrudObject
+ from facebook_business.adobjects.abstractobject import AbstractObject
+ from facebook_business.adobjects.adaccount import AdAccount
+ from facebook_business.adobjects.user import User
+ from facebook_business.api import FacebookResponse
+
+ from .exceptions import InsightsJobTimeout
+ from .settings import (
+     FACEBOOK_INSIGHTS_RETENTION_PERIOD,
+     INSIGHTS_PRIMARY_KEY,
+     TFbMethod,
+ )
+
+
+ def get_start_date(
+     incremental_start_date: dlt.sources.incremental[str],
+     attribution_window_days_lag: int = 7,
+ ) -> pendulum.DateTime:
+     """
+     Get the start date for incremental loading of Facebook Insights data.
+     """
+     start_date: pendulum.DateTime = ensure_pendulum_datetime(
+         incremental_start_date.start_value
+     ).subtract(days=attribution_window_days_lag)
+
+     # facebook forgets insights so trim the lag and warn
+     min_start_date = pendulum.today().subtract(
+         months=FACEBOOK_INSIGHTS_RETENTION_PERIOD
+     )
+     if start_date < min_start_date:
+         logger.warning(
+             "%s: Start date is earlier than %s months ago, using %s instead. "
+             "For more information, see https://www.facebook.com/business/help/1695754927158071?id=354406972049255",
+             "facebook_insights",
+             FACEBOOK_INSIGHTS_RETENTION_PERIOD,
+             min_start_date,
+         )
+         start_date = min_start_date
+         incremental_start_date.start_value = min_start_date
+
+     # lag the incremental start date by attribution window lag
+     incremental_start_date.start_value = start_date.isoformat()
+     return start_date
+
+
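A minimal standalone sketch (not part of the diff) of the windowing logic above, assuming only pendulum; `cursor` and `lag_days` are hypothetical stand-ins for the incremental start value and `attribution_window_days_lag`:

    import pendulum

    # last incremental cursor, shifted back by the attribution window so
    # late-attributed conversions are re-read on the next run
    cursor = pendulum.datetime(2024, 5, 1)
    lag_days = 7
    start_date = cursor.subtract(days=lag_days)

    # clamp to Facebook's insights retention floor (37 months), as above
    retention_floor = pendulum.today().subtract(months=37)
    start_date = max(start_date, retention_floor)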
+ def process_report_item(item: AbstractObject) -> DictStrAny:
+     d: DictStrAny = item.export_all_data()
+     for pki in INSIGHTS_PRIMARY_KEY:
+         if pki not in d:
+             d[pki] = "no_" + pki
+
+     return d
+
+
+ def get_data_chunked(
+     method: TFbMethod, fields: Sequence[str], states: Sequence[str], chunk_size: int
+ ) -> Iterator[TDataItems]:
+     # add pagination and chunk into lists
+     params: DictStrAny = {"limit": chunk_size}
+     if states:
+         params.update({"effective_status": states})
+     it: map[DictStrAny] = map(
+         lambda c: c.export_all_data(), method(fields=fields, params=params)
+     )
+     while True:
+         chunk = list(itertools.islice(it, chunk_size))
+         if not chunk:
+             break
+         yield chunk
+
+
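The loop in get_data_chunked is a generic islice-chunking pattern; here is a self-contained sketch of the same idea with hypothetical names:

    import itertools
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")

    def chunked(items: Iterable[T], size: int) -> Iterator[List[T]]:
        """Yield lists of up to `size` items, mirroring get_data_chunked's loop."""
        it = iter(items)
        while True:
            chunk = list(itertools.islice(it, size))
            if not chunk:
                break
            yield chunk

    assert list(chunked(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]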
+ def enrich_ad_objects(fb_obj_type: AbstractObject, fields: Sequence[str]) -> Any:
+     """Returns a transformation that enriches any of the resources returned by `` with additional fields.
+
+     In the example below we add "thumbnail_url" to all objects loaded by the `ad_creatives` resource:
+     >>> fb_ads = facebook_ads_source()
+     >>> fb_ads.ad_creatives.add_step(enrich_ad_objects(AdCreative, ["thumbnail_url"]))
+
+     Internally, the method uses the batch API to get data efficiently. Refer to the demo script for full examples.
+
+     Args:
+         fb_obj_type (AbstractObject): A Facebook Business object type (Ad, Campaign, AdSet, AdCreative, Lead). Import those types from this module.
+         fields (Sequence[str]): A list/tuple of fields to add to each object.
+
+     Returns:
+         ItemTransformFunctionWithMeta[TDataItems]: A transformation function to be added to a resource with the `add_step` method.
+     """
+
+     def _wrap(items: TDataItems, meta: Any = None) -> TDataItems:
+         api_batch = FacebookAdsApi.get_default_api().new_batch()
+
+         def update_item(resp: FacebookResponse, item: TDataItem) -> None:
+             item.update(resp.json())
+
+         def fail(resp: FacebookResponse) -> None:
+             raise resp.error()
+
+         for item in items:
+             o: AbstractCrudObject = fb_obj_type(item["id"])
+             o.api_get(
+                 fields=fields,
+                 batch=api_batch,
+                 success=functools.partial(update_item, item=item),
+                 failure=fail,
+             )
+         api_batch.execute()
+         return items
+
+     return _wrap
+
+
+ JOB_TIMEOUT_INFO = """This is an intermittent error and may resolve itself on subsequent queries to the Facebook API.
+ You should remove any unnecessary fields from the `fields` argument, as that may help improve the reliability of the Facebook API."""
+
+
+ def execute_job(
+     job: AbstractCrudObject,
+     insights_max_wait_to_start_seconds: int = 5 * 60,
+     insights_max_wait_to_finish_seconds: int = 30 * 60,
+     insights_max_async_sleep_seconds: int = 5 * 60,
+ ) -> AbstractCrudObject:
+     status: str = None
+     time_start = time.time()
+     sleep_time = 10
+     while status != "Job Completed":
+         duration = time.time() - time_start
+         job = job.api_get()
+         status = job["async_status"]
+         percent_complete = job["async_percent_completion"]
+
+         job_id = job["id"]
+         logger.info("%s, %d%% done", status, percent_complete)
+
+         if status == "Job Completed":
+             return job
+
+         if duration > insights_max_wait_to_start_seconds and percent_complete == 0:
+             pretty_error_message = (
+                 "Insights job {} did not start after {} seconds. " + JOB_TIMEOUT_INFO
+             )
+             raise InsightsJobTimeout(
+                 "facebook_insights",
+                 pretty_error_message.format(job_id, insights_max_wait_to_start_seconds),
+             )
+         elif (
+             duration > insights_max_wait_to_finish_seconds and status != "Job Completed"
+         ):
+             pretty_error_message = (
+                 "Insights job {} did not complete after {} minutes. " + JOB_TIMEOUT_INFO
+             )
+             raise InsightsJobTimeout(
+                 "facebook_insights",
+                 pretty_error_message.format(
+                     job_id, insights_max_wait_to_finish_seconds // 60
+                 ),
+             )
+
+         logger.info("sleeping for %d seconds until job is done", sleep_time)
+         time.sleep(sleep_time)
+         if sleep_time < insights_max_async_sleep_seconds:
+             sleep_time = 2 * sleep_time
+     return job
+
+
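A hedged usage sketch for execute_job (not from the diff): the `job` argument is an async insights run started via the facebook-business SDK. `get_insights(..., is_async=True)` and `get_result()` are standard SDK calls, but the account object, fields and params here are illustrative:

    # `account` as returned by get_ads_account below
    async_job = account.get_insights(
        fields=["ad_id", "spend"],
        params={"level": "ad", "time_increment": 1},
        is_async=True,
    )
    completed = execute_job(async_job)
    rows = [row.export_all_data() for row in completed.get_result()]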
+ def get_ads_account(
+     account_id: str, access_token: str, request_timeout: float, app_api_version: str
+ ) -> AdAccount:
+     notify_on_token_expiration()
+
+     def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
+         try:
+             error = response.json()["error"]
+             code = error["code"]
+             message = error["message"]
+             should_retry = code in (
+                 1,
+                 2,
+                 4,
+                 17,
+                 341,
+                 32,
+                 613,
+                 *range(80000, 80007),
+                 800008,
+                 800009,
+                 80014,
+             )
+             if should_retry:
+                 logger.warning(
+                     "facebook_ads source will retry due to %s with error code %i"
+                     % (message, code)
+                 )
+             return should_retry
+         except Exception:
+             return False
+
+     retry_session = Client(
+         request_timeout=request_timeout,
+         raise_for_status=False,
+         retry_condition=retry_on_limit,
+         request_max_attempts=12,
+         request_backoff_factor=2,
+     ).session
+     retry_session.params.update({"access_token": access_token})  # type: ignore
+     # patch dlt requests session with retries
+     API = FacebookAdsApi.init(
+         account_id="act_" + account_id,
+         access_token=access_token,
+         api_version=app_api_version,
+     )
+     API._session.requests = retry_session
+     user = User(fbid="me")
+
+     accounts = user.get_ad_accounts()
+     account: AdAccount = None
+     for acc in accounts:
+         if acc["account_id"] == account_id:
+             account = acc
+
+     if not account:
+         raise ValueError("Couldn't find account with id {}".format(account_id))
+
+     return account
+
+
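The retry wiring above uses dlt's requests Client; a minimal sketch of the same pattern with a simpler, purely illustrative status-code predicate:

    from dlt.sources.helpers.requests import Client

    def retry_on_429(response, exception) -> bool:
        # retry only on explicit rate limiting (hypothetical predicate)
        return response is not None and response.status_code == 429

    session = Client(
        raise_for_status=False,
        retry_condition=retry_on_429,
        request_max_attempts=5,
        request_backoff_factor=2,
    ).session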
+ @with_config(sections=("sources", "facebook_ads"))
+ def notify_on_token_expiration(access_token_expires_at: int = None) -> None:
+     """Notifies (currently via logger) if the access token expires in less than 7 days. Requires `access_token_expires_at` to be configured."""
+     if not access_token_expires_at:
+         logger.warning(
+             "Token expiration notification is disabled. Configure the token expiration timestamp in the access_token_expires_at config value"
+         )
+     else:
+         expires_at = pendulum.from_timestamp(access_token_expires_at)
+         if expires_at < pendulum.now().add(days=7):
+             logger.error(
+                 f"Access Token expires in {humanize.precisedelta(pendulum.now() - expires_at)}. Replace the token now!"
+             )
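Since notify_on_token_expiration is decorated with with_config, the timestamp is normally injected from the ("sources", "facebook_ads") config section, but it can also be passed explicitly; a small sketch:

    import pendulum

    # a token expiring in 3 days triggers the error-level log above
    notify_on_token_expiration(
        access_token_expires_at=int(pendulum.now().add(days=3).timestamp())
    )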
ingestr/src/facebook_ads/settings.py ADDED
@@ -0,0 +1,208 @@
+ """Facebook ads source settings and constants"""
+
+ from typing import Any, Callable, Dict, Iterator, Literal
+
+ from dlt.common.schema.typing import TTableSchemaColumns
+ from facebook_business.adobjects.abstractobject import AbstractObject
+
+ TFbMethod = Callable[..., Iterator[AbstractObject]]
+
+
+ DEFAULT_FIELDS = (
+     "id",
+     "updated_time",
+     "created_time",
+     "name",
+     "status",
+     "effective_status",
+ )
+
+ DEFAULT_CAMPAIGN_FIELDS = DEFAULT_FIELDS + (
+     "objective",
+     "start_time",
+     "stop_time",
+     "daily_budget",
+     "lifetime_budget",
+ )
+
+ DEFAULT_AD_FIELDS = DEFAULT_FIELDS + (
+     "adset_id",
+     "campaign_id",
+     "creative",
+     "targeting",
+     "tracking_specs",
+     "conversion_specs",
+ )
+
+ DEFAULT_ADSET_FIELDS = DEFAULT_FIELDS + (
+     "campaign_id",
+     "start_time",
+     "end_time",
+     "daily_budget",
+     "lifetime_budget",
+     "optimization_goal",
+     "promoted_object",
+     "billing_event",
+     "bid_amount",
+     "bid_strategy",
+     "targeting",
+ )
+
+ DEFAULT_ADCREATIVE_FIELDS = (
+     "id",
+     "name",
+     "status",
+     "thumbnail_url",
+     "object_story_spec",
+     "effective_object_story_id",
+     "call_to_action_type",
+     "object_type",
+     "template_url",
+     "url_tags",
+     "instagram_actor_id",
+     "product_set_id",
+ )
+
+ DEFAULT_LEAD_FIELDS = (
+     "id",
+     "created_time",
+     "ad_id",
+     "ad_name",
+     "adset_id",
+     "adset_name",
+     "campaign_id",
+     "campaign_name",
+     "form_id",
+     "field_data",
+ )
+
+ DEFAULT_INSIGHT_FIELDS = (
+     "campaign_id",
+     "adset_id",
+     "ad_id",
+     "date_start",
+     "date_stop",
+     "reach",
+     "impressions",
+     "frequency",
+     "clicks",
+     "unique_clicks",
+     "ctr",
+     "unique_ctr",
+     "cpc",
+     "cpm",
+     "cpp",
+     "spend",
+     "actions",
+     "action_values",
+     "cost_per_action_type",
+     "website_ctr",
+     "account_currency",
+     "ad_click_actions",
+     "ad_name",
+     "adset_name",
+     "campaign_name",
+     "country",
+     "dma",
+     "full_view_impressions",
+     "full_view_reach",
+     "inline_link_click_ctr",
+     "outbound_clicks",
+     "reach",
+     "social_spend",
+     "spend",
+     "website_ctr",
+ )
+
+ TInsightsLevels = Literal["account", "campaign", "adset", "ad"]
+
+ INSIGHTS_PRIMARY_KEY = ("campaign_id", "adset_id", "ad_id", "date_start")
+
+ ALL_STATES = {
+     "effective_status": [
+         "ACTIVE",
+         "PAUSED",
+         "DELETED",
+         "PENDING_REVIEW",
+         "DISAPPROVED",
+         "PREAPPROVED",
+         "PENDING_BILLING_INFO",
+         "CAMPAIGN_PAUSED",
+         "ARCHIVED",
+         "ADSET_PAUSED",
+     ]
+ }
+
+ TInsightsBreakdownOptions = Literal[
+     "ads_insights",
+     "ads_insights_age_and_gender",
+     "ads_insights_country",
+     "ads_insights_platform_and_device",
+     "ads_insights_region",
+     "ads_insights_dma",
+     "ads_insights_hourly_advertiser",
+ ]
+
+ ALL_ACTION_ATTRIBUTION_WINDOWS = (
+     "1d_click",
+     "7d_click",
+     "28d_click",
+     "1d_view",
+     "7d_view",
+     "28d_view",
+ )
+
+ ALL_ACTION_BREAKDOWNS = ("action_type", "action_target_id", "action_destination")
+
+ INSIGHTS_BREAKDOWNS_OPTIONS: Dict[TInsightsBreakdownOptions, Any] = {
+     "ads_insights": {"breakdowns": (), "fields": ()},
+     "ads_insights_age_and_gender": {
+         "breakdowns": ("age", "gender"),
+         "fields": ("age", "gender"),
+     },
+     "ads_insights_country": {"breakdowns": ("country",), "fields": ("country",)},
+     "ads_insights_platform_and_device": {
+         "breakdowns": ("publisher_platform", "platform_position", "impression_device"),
+         "fields": ("publisher_platform", "platform_position", "impression_device"),
+     },
+     "ads_insights_region": {"breakdowns": ("region",), "fields": ("region",)},
+     "ads_insights_dma": {"breakdowns": ("dma",), "fields": ("dma",)},
+     "ads_insights_hourly_advertiser": {
+         "breakdowns": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+         "fields": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+     },
+ }
+
+ INSIGHT_FIELDS_TYPES: TTableSchemaColumns = {
+     "campaign_id": {"data_type": "bigint"},
+     "adset_id": {"data_type": "bigint"},
+     "ad_id": {"data_type": "bigint"},
+     "date_start": {"data_type": "timestamp"},
+     "date_stop": {"data_type": "timestamp"},
+     "reach": {"data_type": "bigint"},
+     "impressions": {"data_type": "bigint"},
+     "frequency": {"data_type": "decimal"},
+     "clicks": {"data_type": "bigint"},
+     "unique_clicks": {"data_type": "bigint"},
+     "ctr": {"data_type": "decimal"},
+     "unique_ctr": {"data_type": "decimal"},
+     "cpc": {"data_type": "decimal"},
+     "cpm": {"data_type": "decimal"},
+     "cpp": {"data_type": "decimal"},
+     "spend": {"data_type": "decimal"},
+ }
+
+ INVALID_INSIGHTS_FIELDS = [
+     "impression_device",
+     "publisher_platform",
+     "platform_position",
+     "age",
+     "gender",
+     "country",
+     "placement",
+     "region",
+     "dma",
+     "hourly_stats_aggregated_by_advertiser_time_zone",
+ ]
+
+ FACEBOOK_INSIGHTS_RETENTION_PERIOD = 37  # months
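A hedged sketch (not in the diff) of how these constants would combine into an insights request, assuming the source filters DEFAULT_INSIGHT_FIELDS by INVALID_INSIGHTS_FIELDS and merges in a breakdown preset:

    option = INSIGHTS_BREAKDOWNS_OPTIONS["ads_insights_age_and_gender"]
    fields = [
        f for f in DEFAULT_INSIGHT_FIELDS if f not in INVALID_INSIGHTS_FIELDS
    ] + list(option["fields"])
    params = {"breakdowns": list(option["breakdowns"]), "level": "ad"}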
ingestr/src/factory.py CHANGED
@@ -15,11 +15,16 @@ from ingestr.src.destinations import (
      SynapseDestination,
  )
  from ingestr.src.sources import (
+     AdjustSource,
      AirtableSource,
+     AppsflyerSource,
      ChessSource,
+     FacebookAdsSource,
      GoogleSheetsSource,
      GorgiasSource,
      HubspotSource,
+     KafkaSource,
+     KlaviyoSource,
      LocalCsvSource,
      MongoDbSource,
      NotionSource,
@@ -111,12 +116,22 @@ class SourceDestinationFactory:
              return ChessSource()
          elif self.source_scheme == "stripe":
              return StripeAnalyticsSource()
+         elif self.source_scheme == "facebookads":
+             return FacebookAdsSource()
          elif self.source_scheme == "slack":
              return SlackSource()
          elif self.source_scheme == "hubspot":
              return HubspotSource()
          elif self.source_scheme == "airtable":
              return AirtableSource()
+         elif self.source_scheme == "klaviyo":
+             return KlaviyoSource()
+         elif self.source_scheme == "appsflyer":
+             return AppsflyerSource()
+         elif self.source_scheme == "kafka":
+             return KafkaSource()
+         elif self.source_scheme == "adjust":
+             return AdjustSource()
          else:
              raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
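A hedged sketch of the dispatch this hunk extends; the constructor arguments and get_source() method are assumptions about SourceDestinationFactory's interface, and the URIs are hypothetical:

    factory = SourceDestinationFactory(
        "facebookads://?access_token=...&account_id=...",  # hypothetical source URI
        "duckdb:///ads.db",
    )
    source = factory.get_source()  # returns FacebookAdsSource() per the branch above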
ingestr/src/kafka/__init__.py ADDED
@@ -0,0 +1,103 @@
+ """A source to extract Kafka messages.
+
+ When extraction starts, the current length of each partition is
+ checked, and data is read only up to that point, overriding Kafka's
+ default behavior of waiting endlessly for new messages.
+ """
+
+ from contextlib import closing
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+
+ import dlt
+ from confluent_kafka import Consumer, Message  # type: ignore
+ from dlt.common import logger
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TAnyDateTime, TDataItem
+
+ from .helpers import (
+     KafkaCredentials,
+     OffsetTracker,
+     default_msg_processor,
+ )
+
+
+ @dlt.resource(
+     name="kafka_messages",
+     table_name=lambda msg: msg["_kafka"]["topic"],
+     standalone=True,
+ )
+ def kafka_consumer(
+     topics: Union[str, List[str]],
+     credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
+     msg_processor: Optional[
+         Callable[[Message], Dict[str, Any]]
+     ] = default_msg_processor,
+     batch_size: Optional[int] = 3000,
+     batch_timeout: Optional[int] = 3,
+     start_from: Optional[TAnyDateTime] = None,
+ ) -> Iterable[TDataItem]:
+     """Extract recent messages from the given Kafka topics.
+
+     The resource tracks offsets for all the topics and partitions,
+     and so reads data incrementally.
+
+     Messages from different topics are saved in different tables.
+
+     Args:
+         topics (Union[str, List[str]]): Names of topics to extract.
+         credentials (Optional[Union[KafkaCredentials, Consumer]]):
+             Auth credentials or an initialized Kafka consumer. By
+             default, they are taken from secrets.
+         msg_processor (Optional[Callable]): A converter function that
+             processes every Kafka message after it is read and before
+             it is transferred to the destination.
+         batch_size (Optional[int]): The number of messages to read in
+             one batch.
+         batch_timeout (Optional[int]): Maximum time to wait for a batch
+             to be consumed, in seconds.
+         start_from (Optional[TAnyDateTime]): A timestamp at which to
+             start reading. Older messages are ignored.
+
+     Yields:
+         Iterable[TDataItem]: Kafka messages.
+     """
+     if not isinstance(topics, list):
+         topics = [topics]
+
+     if isinstance(credentials, Consumer):
+         consumer = credentials
+     elif isinstance(credentials, KafkaCredentials):
+         consumer = credentials.init_consumer()
+     else:
+         raise TypeError(
+             (
+                 "Wrong credentials type provided. Need to be of type: "
+                 "KafkaCredentials or confluent_kafka.Consumer"
+             )
+         )
+
+     if start_from is not None:
+         start_from = ensure_pendulum_datetime(start_from)
+
+     tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)  # type: ignore
+
+     # read messages up to the maximum offsets,
+     # not waiting for new messages
+     with closing(consumer):
+         while tracker.has_unread:
+             messages = consumer.consume(batch_size, timeout=batch_timeout)
+             if not messages:
+                 break
+
+             batch = []
+             for msg in messages:
+                 if msg.error():
+                     err = msg.error()
+                     if err.retriable() or not err.fatal():
+                         logger.warning(f"ERROR: {err} - RETRYING")
+                     else:
+                         raise err
+                 else:
+                     batch.append(msg_processor(msg))  # type: ignore
+                     tracker.renew(msg)
+
+             yield batch
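A hedged end-to-end sketch (not from the diff) of running this resource in a dlt pipeline; the destination, dataset and topic names are illustrative, and credentials are expected to resolve from dlt secrets:

    import dlt
    import pendulum

    pipeline = dlt.pipeline(destination="duckdb", dataset_name="kafka_data")
    messages = kafka_consumer(
        topics=["events"],  # one table per topic, via table_name above
        batch_size=1000,
        start_from=pendulum.datetime(2024, 1, 1),
    )
    pipeline.run(messages)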