ingestr 0.7.7__py3-none-any.whl → 0.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ingestr might be problematic. Click here for more details.

ingestr/main.py CHANGED
@@ -244,6 +244,13 @@ def ingest(
244
244
  envvar="PIPELINES_DIR",
245
245
  ),
246
246
  ] = None, # type: ignore
247
+ extract_parallelism: Annotated[
248
+ Optional[int],
249
+ typer.Option(
250
+ help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
251
+ envvar="EXTRACT_PARALLELISM",
252
+ ),
253
+ ] = 5, # type: ignore
247
254
  ):
248
255
  track(
249
256
  "command_triggered",
@@ -253,6 +260,8 @@ def ingest(
253
260
  )
254
261
 
255
262
  dlt.config["data_writer.file_max_items"] = loader_file_size
263
+ dlt.config["extract.workers"] = extract_parallelism
264
+ dlt.config["extract.max_parallel_items"] = extract_parallelism
256
265
  if schema_naming != SchemaNaming.default:
257
266
  dlt.config["schema.naming"] = schema_naming.value
258
267
 
ingestr/src/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ # ignore secrets, virtual environments and typical python compilation artifacts
2
+ secrets.toml
3
+ # ignore basic python artifacts
4
+ .env
5
+ **/__pycache__/
6
+ **/*.py[cod]
7
+ **/*$py.class
8
+ # ignore duckdb
9
+ *.duckdb
10
+ *.wal
@@ -0,0 +1,197 @@
1
+ """Loads campaigns, ads sets, ads, leads and insight data from Facebook Marketing API"""
2
+
3
+ from typing import Iterator, Sequence
4
+
5
+ import dlt
6
+ from dlt.common import pendulum
7
+ from dlt.common.typing import TDataItems
8
+ from dlt.sources import DltResource
9
+ from facebook_business.adobjects.ad import Ad
10
+
11
+ from .helpers import (
12
+ execute_job,
13
+ get_ads_account,
14
+ get_data_chunked,
15
+ get_start_date,
16
+ process_report_item,
17
+ )
18
+ from .settings import (
19
+ ALL_ACTION_ATTRIBUTION_WINDOWS,
20
+ ALL_ACTION_BREAKDOWNS,
21
+ DEFAULT_AD_FIELDS,
22
+ DEFAULT_ADCREATIVE_FIELDS,
23
+ DEFAULT_ADSET_FIELDS,
24
+ DEFAULT_CAMPAIGN_FIELDS,
25
+ DEFAULT_INSIGHT_FIELDS,
26
+ DEFAULT_LEAD_FIELDS,
27
+ INSIGHT_FIELDS_TYPES,
28
+ INSIGHTS_BREAKDOWNS_OPTIONS,
29
+ INSIGHTS_PRIMARY_KEY,
30
+ INVALID_INSIGHTS_FIELDS,
31
+ TInsightsBreakdownOptions,
32
+ TInsightsLevels,
33
+ )
34
+
35
+
36
@dlt.source(name="facebook_ads", max_table_nesting=0)
def facebook_ads_source(
    account_id: str = dlt.config.value,
    access_token: str = dlt.secrets.value,
    chunk_size: int = 50,
    request_timeout: float = 300.0,
    app_api_version: str = "v20.0",
) -> Sequence[DltResource]:
    """Returns the resources that load campaigns, ads, ad sets, ad creatives and ad leads from the Facebook Marketing API.

    Every resource is declared with primary key `id` and the `replace` write disposition. Each resource
    accepts `fields` (the fields requested from the API) and `states` (forwarded by `get_data_chunked`
    as the `effective_status` request parameter when non-empty).

    `leads` is a transformer: it is piped from `ads` and requests the leads of every ad it receives.

    The `enrich_ad_objects` helper can be added to any of the resources with `add_step` to fetch
    additional fields per object.

    Args:
        account_id (str, optional): Account id associated with the ad manager. See README.md
        access_token (str, optional): Access token associated with the Business Facebook App. See README.md
        chunk_size (int, optional): Page size of a request and size of the yielded chunks. You may need to decrease it if you request a lot of fields. Defaults to 50.
        request_timeout (float, optional): Connection timeout. Defaults to 300.0.
        app_api_version (str, optional): A version of the facebook api required by the app for which the access tokens were issued ie. 'v17.0'. Defaults to "v20.0".

    Returns:
        Sequence[DltResource]: campaigns, ads, ad_sets, ad_creatives, leads (as `ads | leads`)
    """
    # Also configures the default Facebook API session; raises ValueError when the account is not found.
    account = get_ads_account(
        account_id, access_token, request_timeout, app_api_version
    )

    @dlt.resource(primary_key="id", write_disposition="replace")
    def campaigns(
        fields: Sequence[str] = DEFAULT_CAMPAIGN_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        # Yields the chunk generator itself rather than iterating it here.
        yield get_data_chunked(account.get_campaigns, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ads(
        fields: Sequence[str] = DEFAULT_AD_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        yield get_data_chunked(account.get_ads, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ad_sets(
        fields: Sequence[str] = DEFAULT_ADSET_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        yield get_data_chunked(account.get_ad_sets, fields, states, chunk_size)

    @dlt.transformer(primary_key="id", write_disposition="replace", selected=True)
    def leads(
        items: TDataItems,
        fields: Sequence[str] = DEFAULT_LEAD_FIELDS,
        states: Sequence[str] = None,
    ) -> Iterator[TDataItems]:
        # One paginated leads request per incoming ad, addressed by the ad's "id".
        for item in items:
            ad = Ad(item["id"])
            yield get_data_chunked(ad.get_leads, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ad_creatives(
        fields: Sequence[str] = DEFAULT_ADCREATIVE_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        yield get_data_chunked(account.get_ad_creatives, fields, states, chunk_size)

    # `ads | leads` pipes the ads resource into the leads transformer.
    return campaigns, ads, ad_sets, ad_creatives, ads | leads
102
+
103
+
104
@dlt.source(name="facebook_ads", max_table_nesting=0)
def facebook_insights_source(
    account_id: str = dlt.config.value,
    access_token: str = dlt.secrets.value,
    initial_load_past_days: int = 1,
    fields: Sequence[str] = DEFAULT_INSIGHT_FIELDS,
    attribution_window_days_lag: int = 7,
    time_increment_days: int = 1,
    breakdowns: TInsightsBreakdownOptions = "ads_insights",
    action_breakdowns: Sequence[str] = ALL_ACTION_BREAKDOWNS,
    level: TInsightsLevels = "ad",
    action_attribution_windows: Sequence[str] = ALL_ACTION_ATTRIBUTION_WINDOWS,
    batch_size: int = 50,
    request_timeout: int = 300,
    app_api_version: str = None,
) -> DltResource:
    """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

    Reports are requested one at a time, each covering `time_increment_days` days, from the computed
    start date up to the current time. The incremental cursor `date_start` is initialised to today minus
    `initial_load_past_days`; `get_start_date` then moves the start back by `attribution_window_days_lag`
    days, so the reports inside the attribution window are requested again on each load. The resource
    uses the `merge` write disposition with `INSIGHTS_PRIMARY_KEY` as the primary key.

    Mind that each report is an asynchronous job: `execute_job` polls it until it completes or times out.

    Args:
        account_id (str, optional): Account id passed to `get_ads_account`. Taken from dlt config by default.
        access_token (str, optional): Access token passed to `get_ads_account`. Taken from dlt secrets by default.
        initial_load_past_days (int, optional): How many past days (starting from today) to initially load. Defaults to 1.
        fields (Sequence[str], optional): A list of fields to include in each report. Note that the `breakdowns` option adds fields automatically. Defaults to DEFAULT_INSIGHT_FIELDS.
        attribution_window_days_lag (int, optional): Attribution window in days. The reports in the attribution window are refreshed on each run. Defaults to 7.
        time_increment_days (int, optional): The report aggregation window in days. Use 7 for weekly aggregation. Defaults to 1.
        breakdowns (TInsightsBreakdownOptions, optional): A preset with common aggregations, looked up in INSIGHTS_BREAKDOWNS_OPTIONS. See settings.py for details. Defaults to "ads_insights".
        action_breakdowns (Sequence[str], optional): Action aggregation types. See settings.py for details. Defaults to ALL_ACTION_BREAKDOWNS.
        level (TInsightsLevels, optional): The granularity level. Defaults to "ad".
        action_attribution_windows (Sequence[str], optional): Attribution windows for actions. Defaults to ALL_ACTION_ATTRIBUTION_WINDOWS.
        batch_size (int, optional): Sent as the `limit` parameter of the insights request. Defaults to 50.
        request_timeout (int, optional): Connection timeout. Defaults to 300.
        app_api_version (str, optional): A version of the facebook api required by the app for which the access tokens were issued ie. 'v17.0'. Defaults to None, which is passed through to `FacebookAdsApi.init` (presumably selecting the facebook_business library default - confirm).

    Returns:
        DltResource: facebook_insights

    """
    account = get_ads_account(
        account_id, access_token, request_timeout, app_api_version
    )

    # Initial cursor value: midnight today minus `initial_load_past_days`, stored as an ISO string.
    initial_load_start_date = pendulum.today().subtract(days=initial_load_past_days)
    initial_load_start_date_str = initial_load_start_date.isoformat()

    @dlt.resource(
        primary_key=INSIGHTS_PRIMARY_KEY,
        write_disposition="merge",
        columns=INSIGHT_FIELDS_TYPES,
    )
    def facebook_insights(
        date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
            "date_start", initial_value=initial_load_start_date_str
        ),
    ) -> Iterator[TDataItems]:
        # get_start_date applies the attribution lag and also rewrites date_start.start_value.
        start_date = get_start_date(date_start, attribution_window_days_lag)
        end_date = pendulum.now()

        # One async report job per `time_increment_days` window until the window start passes "now".
        while start_date <= end_date:
            query = {
                "level": level,
                "action_breakdowns": list(action_breakdowns),
                "breakdowns": list(
                    INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["breakdowns"]
                ),
                "limit": batch_size,
                # Requested fields plus the fields of the breakdown preset, minus the invalid ones.
                "fields": list(
                    set(fields)
                    .union(INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["fields"])
                    .difference(INVALID_INSIGHTS_FIELDS)
                ),
                "time_increment": time_increment_days,
                "action_attribution_windows": list(action_attribution_windows),
                "time_ranges": [
                    {
                        # Window of `time_increment_days` days starting at start_date
                        # (presumably `until` is inclusive - confirm against the API docs).
                        "since": start_date.to_date_string(),
                        "until": start_date.add(
                            days=time_increment_days - 1
                        ).to_date_string(),
                    }
                ],
            }
            # Blocks until the job reports "Job Completed"; may raise InsightsJobTimeout.
            job = execute_job(account.get_insights(params=query, is_async=True))
            yield list(map(process_report_item, job.get_result()))
            start_date = start_date.add(days=time_increment_days)

    return facebook_insights
@@ -0,0 +1,5 @@
1
+ from dlt.extract.exceptions import DltResourceException
2
+
3
+
4
class InsightsJobTimeout(DltResourceException):
    """Raised when an asynchronous insights job does not start or complete within the allowed time."""
@@ -0,0 +1,255 @@
1
+ """Facebook ads source helpers"""
2
+
3
+ import functools
4
+ import itertools
5
+ import time
6
+ from typing import Any, Iterator, Sequence
7
+
8
+ import dlt
9
+ import humanize
10
+ import pendulum
11
+ from dlt.common import logger
12
+ from dlt.common.configuration.inject import with_config
13
+ from dlt.common.time import ensure_pendulum_datetime
14
+ from dlt.common.typing import DictStrAny, TDataItem, TDataItems
15
+ from dlt.sources.helpers import requests
16
+ from dlt.sources.helpers.requests import Client
17
+ from facebook_business import FacebookAdsApi
18
+ from facebook_business.adobjects.abstractcrudobject import AbstractCrudObject
19
+ from facebook_business.adobjects.abstractobject import AbstractObject
20
+ from facebook_business.adobjects.adaccount import AdAccount
21
+ from facebook_business.adobjects.user import User
22
+ from facebook_business.api import FacebookResponse
23
+
24
+ from .exceptions import InsightsJobTimeout
25
+ from .settings import (
26
+ FACEBOOK_INSIGHTS_RETENTION_PERIOD,
27
+ INSIGHTS_PRIMARY_KEY,
28
+ TFbMethod,
29
+ )
30
+
31
+
32
def get_start_date(
    incremental_start_date: dlt.sources.incremental[str],
    attribution_window_days_lag: int = 7,
) -> pendulum.DateTime:
    """Get the start date for incremental loading of Facebook Insights data.

    The incremental start value is moved back by `attribution_window_days_lag` days and
    clamped so that it is never earlier than FACEBOOK_INSIGHTS_RETENTION_PERIOD months
    before today (a warning is logged when clamping happens).

    Args:
        incremental_start_date: The incremental cursor; its `start_value` is read and then
            overwritten with the ISO string of the returned date.
        attribution_window_days_lag: Number of days to move the start date back. Defaults to 7.

    Returns:
        pendulum.DateTime: The date from which reports should be requested.
    """
    start_date: pendulum.DateTime = ensure_pendulum_datetime(
        incremental_start_date.start_value
    ).subtract(days=attribution_window_days_lag)

    # facebook forgets insights so trim the lag and warn
    min_start_date = pendulum.today().subtract(
        months=FACEBOOK_INSIGHTS_RETENTION_PERIOD
    )
    if start_date < min_start_date:
        logger.warning(
            "%s: Start date is earlier than %s months ago, using %s instead. "
            "For more information, see https://www.facebook.com/business/help/1695754927158071?id=354406972049255",
            "facebook_insights",
            FACEBOOK_INSIGHTS_RETENTION_PERIOD,
            min_start_date,
        )
        start_date = min_start_date

    # Single write of the cursor, always as an ISO string. The original additionally stored
    # the raw DateTime object inside the branch above, only to overwrite it right here.
    incremental_start_date.start_value = start_date.isoformat()
    return start_date
61
+
62
+
63
def process_report_item(item: AbstractObject) -> DictStrAny:
    """Export a report item to a dict and fill every missing primary-key column.

    A key from INSIGHTS_PRIMARY_KEY that is absent from the exported data is set to the
    placeholder ``"no_" + key``; keys that are present are left untouched.
    """
    record: DictStrAny = item.export_all_data()
    for key_name in INSIGHTS_PRIMARY_KEY:
        record.setdefault(key_name, "no_" + key_name)
    return record
70
+
71
+
72
def get_data_chunked(
    method: TFbMethod, fields: Sequence[str], states: Sequence[str], chunk_size: int
) -> Iterator[TDataItems]:
    """Call `method` and yield its exported results in lists of at most `chunk_size` items.

    Args:
        method: Callable invoked as ``method(fields=..., params=...)``; every object it
            returns must provide ``export_all_data()``.
        fields: Fields passed to `method`.
        states: When non-empty, sent as the ``effective_status`` request parameter.
        chunk_size: Sent as the ``limit`` request parameter and used as the chunk length.

    Yields:
        Non-empty lists of exported items; nothing is yielded when there are no results.
    """
    request_params: DictStrAny = {"limit": chunk_size}
    if states:
        request_params["effective_status"] = states

    exported = (
        obj.export_all_data() for obj in method(fields=fields, params=request_params)
    )
    # Keep slicing off chunks until an empty one signals exhaustion.
    yield from iter(lambda: list(itertools.islice(exported, chunk_size)), [])
87
+
88
+
89
def enrich_ad_objects(fb_obj_type: AbstractObject, fields: Sequence[str]) -> Any:
    """Build a transformation that adds extra fields to the items of a `facebook_ads_source` resource.

    In the example below "thumbnail_url" is added to all objects loaded by the `ad_creatives` resource:
    >>> fb_ads = facebook_ads_source()
    >>> fb_ads.ad_creatives.add_step(enrich_ad_objects(AdCreative, ["thumbnail_url"]))

    The returned function queues one `api_get` call per item on a single batch obtained from the
    default `FacebookAdsApi`, executes the batch and merges each response into its item in place.

    Args:
        fb_obj_type (AbstractObject): A Facebook Business object type (Ad, Campaign, AdSet, AdCreative, Lead).
        fields (Sequence[str]): A list/tuple of fields to add to each object.

    Returns:
        A function taking `(items, meta=None)` that returns the same `items` after enrichment;
        intended to be added to a resource with the `add_step` method.
    """

    def _merge_response(resp: FacebookResponse, item: TDataItem) -> None:
        # Success callback: copy the fetched fields into the originating item.
        item.update(resp.json())

    def _raise_error(resp: FacebookResponse) -> None:
        # Failure callback: surface the error of the failed call.
        raise resp.error()

    def _wrap(items: TDataItems, meta: Any = None) -> TDataItems:
        batch = FacebookAdsApi.get_default_api().new_batch()
        for record in items:
            fb_object: AbstractCrudObject = fb_obj_type(record["id"])
            on_success = functools.partial(_merge_response, item=record)
            fb_object.api_get(
                fields=fields,
                batch=batch,
                success=on_success,
                failure=_raise_error,
            )
        # NOTE(review): the return value of execute() is ignored - confirm whether it can
        # hand back a follow-up batch that would need executing as well.
        batch.execute()
        return items

    return _wrap
127
+
128
+
129
# Advice text appended to the InsightsJobTimeout messages built in `execute_job`.
JOB_TIMEOUT_INFO = """This is an intermittent error and may resolve itself on subsequent queries to the Facebook API.
You should remove the fields in `fields` argument that are not necessary, as that may help improve the reliability of the Facebook API."""
131
+
132
+
133
def execute_job(
    job: AbstractCrudObject,
    insights_max_wait_to_start_seconds: int = 5 * 60,
    insights_max_wait_to_finish_seconds: int = 30 * 60,
    insights_max_async_sleep_seconds: int = 5 * 60,
) -> AbstractCrudObject:
    """Poll an asynchronous insights job until it reports "Job Completed".

    The job is refreshed with `api_get()`; between polls the function sleeps, starting at
    10 seconds and doubling after every poll up to `insights_max_async_sleep_seconds`.

    Args:
        job: The async job object to poll.
        insights_max_wait_to_start_seconds: Maximum time the job may stay at 0% completion.
        insights_max_wait_to_finish_seconds: Maximum total time the job may take.
        insights_max_async_sleep_seconds: Upper bound for the sleep between two polls.

    Returns:
        The refreshed job object whose `async_status` is "Job Completed".

    Raises:
        InsightsJobTimeout: If the job is still at 0% after the start limit, or has not
            completed after the finish limit.
    """
    # NOTE(review): only "Job Completed" ends the loop successfully; any other status
    # (including a possible terminal failure status) is polled until a timeout fires.
    time_start = time.time()
    sleep_time = 10
    while True:
        duration = time.time() - time_start
        job = job.api_get()
        status: str = job["async_status"]
        percent_complete = job["async_percent_completion"]

        job_id = job["id"]
        logger.info("%s, %d%% done", status, percent_complete)

        if status == "Job Completed":
            return job

        if duration > insights_max_wait_to_start_seconds and percent_complete == 0:
            pretty_error_message = (
                "Insights job {} did not start after {} seconds. " + JOB_TIMEOUT_INFO
            )
            raise InsightsJobTimeout(
                "facebook_insights",
                pretty_error_message.format(job_id, insights_max_wait_to_start_seconds),
            )
        if duration > insights_max_wait_to_finish_seconds:
            pretty_error_message = (
                "Insights job {} did not complete after {} seconds. " + JOB_TIMEOUT_INFO
            )
            # The message is worded in seconds, so report the limit in seconds
            # (it used to be divided by 60 while still being labelled "seconds").
            raise InsightsJobTimeout(
                "facebook_insights",
                pretty_error_message.format(
                    job_id, insights_max_wait_to_finish_seconds
                ),
            )

        logger.info("sleeping for %d seconds until job is done", sleep_time)
        time.sleep(sleep_time)
        if sleep_time < insights_max_async_sleep_seconds:
            # Exponential back-off that never exceeds the configured maximum
            # (plain doubling went from 160 to 320 with the default cap of 300).
            sleep_time = min(2 * sleep_time, insights_max_async_sleep_seconds)
180
+
181
+
182
def get_ads_account(
    account_id: str, access_token: str, request_timeout: float, app_api_version: str
) -> AdAccount:
    """Initialise the Facebook Ads API with a retrying session and return the requested ad account.

    Args:
        account_id: Ad account id without the "act_" prefix (the prefix is added for `FacebookAdsApi.init`).
        access_token: Access token used both for the API and as a default session parameter.
        request_timeout: Request timeout handed to the dlt requests `Client`.
        app_api_version: API version passed to `FacebookAdsApi.init`.

    Returns:
        AdAccount: The account of user "me" whose `account_id` equals `account_id`.

    Raises:
        ValueError: If the user has no ad account with the given id.
    """
    notify_on_token_expiration()

    # Error codes worth retrying. 80008 and 80009 replace the original 800008 / 800009,
    # which look like typos next to the 80000-80014 family listed around them.
    retryable_codes = (
        1,
        2,
        4,
        17,
        341,
        32,
        613,
        *range(80000, 80007),
        80008,
        80009,
        80014,
    )

    def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
        try:
            error = response.json()["error"]
            code = error["code"]
            message = error["message"]
            should_retry = code in retryable_codes
            if should_retry:
                logger.warning(
                    "facebook_ads source will retry due to %s with error code %i",
                    message,
                    code,
                )
            return should_retry
        except Exception:
            # Deliberately best-effort: a missing response or a body without the
            # expected error payload simply means "do not retry".
            return False

    retry_session = Client(
        request_timeout=request_timeout,
        raise_for_status=False,
        retry_condition=retry_on_limit,
        request_max_attempts=12,
        request_backoff_factor=2,
    ).session
    retry_session.params.update({"access_token": access_token})  # type: ignore
    # patch dlt requests session with retries
    API = FacebookAdsApi.init(
        account_id="act_" + account_id,
        access_token=access_token,
        api_version=app_api_version,
    )
    API._session.requests = retry_session
    user = User(fbid="me")

    # Stop at the first matching account instead of iterating over all remaining ones.
    account: AdAccount = next(
        (acc for acc in user.get_ad_accounts() if acc["account_id"] == account_id),
        None,
    )
    if account is None:
        raise ValueError("Couldn't find account with id {}".format(account_id))

    return account
241
+
242
+
243
@with_config(sections=("sources", "facebook_ads"))
def notify_on_token_expiration(access_token_expires_at: int = None) -> None:
    """Notifies (currently via logger) if access token expires in less than 7 days. Needs `access_token_expires_at` to be configured.

    Args:
        access_token_expires_at: Expiration time of the access token as a timestamp
            (passed to `pendulum.from_timestamp`). When missing or 0, only a warning that
            the notification is disabled is logged.
    """
    if not access_token_expires_at:
        logger.warning(
            "Token expiration time notification disabled. Configure token expiration timestamp in access_token_expires_at config value"
        )
        return

    # Read the clock once so the comparison and the reported delta agree.
    now = pendulum.now()
    expires_at = pendulum.from_timestamp(access_token_expires_at)
    if expires_at <= now:
        # An already-expired token used to be reported as "expires in ...".
        logger.error(
            f"Access Token expired {humanize.precisedelta(now - expires_at)} ago. Replace the token now!"
        )
    elif expires_at < now.add(days=7):
        logger.error(
            f"Access Token expires in {humanize.precisedelta(expires_at - now)}. Replace the token now!"
        )