ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestr/main.py CHANGED
@@ -244,6 +244,13 @@ def ingest(
244
244
  envvar="PIPELINES_DIR",
245
245
  ),
246
246
  ] = None, # type: ignore
247
+ extract_parallelism: Annotated[
248
+ Optional[int],
249
+ typer.Option(
250
+ help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
251
+ envvar="EXTRACT_PARALLELISM",
252
+ ),
253
+ ] = 5, # type: ignore
247
254
  ):
248
255
  track(
249
256
  "command_triggered",
@@ -252,7 +259,10 @@ def ingest(
252
259
  },
253
260
  )
254
261
 
262
+ dlt.config["data_writer.buffer_max_items"] = page_size
255
263
  dlt.config["data_writer.file_max_items"] = loader_file_size
264
+ dlt.config["extract.workers"] = extract_parallelism
265
+ dlt.config["extract.max_parallel_items"] = extract_parallelism
256
266
  if schema_naming != SchemaNaming.default:
257
267
  dlt.config["schema.naming"] = schema_naming.value
258
268
 
ingestr/src/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ # ignore secrets, virtual environments and typical python compilation artifacts
2
+ secrets.toml
3
+ # ignore basic python artifacts
4
+ .env
5
+ **/__pycache__/
6
+ **/*.py[cod]
7
+ **/*$py.class
8
+ # ignore duckdb
9
+ *.duckdb
10
+ *.wal
@@ -0,0 +1,31 @@
1
+ from typing import Sequence
2
+
3
+ import dlt
4
+ from dlt.sources import DltResource
5
+
6
+ from .helpers import DEFAULT_DIMENSIONS, AdjustAPI
7
+
8
+
9
@dlt.source(max_table_nesting=0)
def adjust_source(
    start_date: str,
    end_date: str,
    api_key: str,
) -> Sequence[DltResource]:
    """Build the dlt resources that load Adjust report data.

    Args:
        start_date: Start of the report period; forwarded unchanged to
            `AdjustAPI.fetch_report_data`.
        end_date: End of the report period; forwarded unchanged to
            `AdjustAPI.fetch_report_data`.
        api_key: Adjust API key used to construct the `AdjustAPI` client.

    Returns:
        The `campaigns` and `creatives` resources, in that order. Both use
        the "merge" write disposition with "day" as the merge key.
    """

    def _fetch_report(**report_options):
        # A new client is created for every resource run, as each resource
        # is evaluated independently.
        api = AdjustAPI(api_key=api_key)
        return api.fetch_report_data(
            start_date=start_date,
            end_date=end_date,
            **report_options,
        )

    @dlt.resource(write_disposition="merge", merge_key="day")
    def campaigns():
        # Uses the client's default dimensions.
        yield from _fetch_report()

    @dlt.resource(write_disposition="merge", merge_key="day")
    def creatives():
        # Same report, broken down further by ad group and creative.
        creative_dimensions = DEFAULT_DIMENSIONS + ["adgroup", "creative"]
        yield from _fetch_report(dimensions=creative_dimensions)

    return campaigns, creatives
@@ -0,0 +1,82 @@
1
+ import requests
2
+ from dlt.sources.helpers.requests import Client
3
+ from requests.exceptions import HTTPError
4
+
5
# Dimensions requested from the report endpoint unless the caller overrides them.
DEFAULT_DIMENSIONS = ["campaign", "day", "app", "store_type", "channel", "country"]

# Metrics requested from the report endpoint unless the caller overrides them.
DEFAULT_METRICS = [
    "network_cost",
    "all_revenue_total_d0",
    "ad_revenue_total_d0",
    "revenue_total_d0",
    "all_revenue_total_d1",
    "ad_revenue_total_d1",
    "revenue_total_d1",
    "all_revenue_total_d3",
    "ad_revenue_total_d3",
    "revenue_total_d3",
    "all_revenue_total_d7",
    "ad_revenue_total_d7",
    "revenue_total_d7",
    "all_revenue_total_d14",
    "ad_revenue_total_d14",
    "revenue_total_d14",
    "all_revenue_total_d21",
]


class AdjustAPI:
    """Small client for the Adjust report-service endpoint."""

    def __init__(self, api_key):
        """Store the API key and the report endpoint URI."""
        self.api_key = api_key
        self.uri = "https://automate.adjust.com/reports-service/report"

    @staticmethod
    def _retry_on_limit(response, exception) -> bool:
        """Decide whether a request should be retried because of rate limiting.

        Args:
            response: The HTTP response of the attempt, or None when the
                attempt ended in an exception and no response is available.
            exception: The exception raised by the attempt, if any (unused).

        Returns:
            True only when a response exists and its status code is 429.
        """
        # The retry machinery may call the predicate without a response
        # (e.g. after a connection error); never dereference None here.
        return response is not None and response.status_code == 429

    def fetch_report_data(
        self,
        start_date,
        end_date,
        dimensions=DEFAULT_DIMENSIONS,
        metrics=DEFAULT_METRICS,
        utc_offset="+00:00",
        ad_spend_mode="network",
        attribution_source="first",
        attribution_type="all",
        cohort_maturity="immature",
        reattributed="all",
        sandbox="false",
    ):
        """Request one report and yield its rows as a single list.

        Args:
            start_date: Start of the period; sent as the left side of
                `date_period` ("start:end").
            end_date: End of the period; sent as the right side of
                `date_period`.
            dimensions: Dimension names, sent comma-separated.
            metrics: Metric names, sent comma-separated.
            utc_offset, ad_spend_mode, attribution_source, attribution_type,
            cohort_maturity, reattributed, sandbox: Forwarded verbatim as
                query parameters of the same name.

        Yields:
            The list found under the "rows" key of the JSON response, or an
            empty list when that key is absent.

        Raises:
            HTTPError: If the final response status code is not 200.
        """
        headers = {"Authorization": f"Bearer {self.api_key}"}
        params = {
            "date_period": f"{start_date}:{end_date}",
            "dimensions": ",".join(dimensions),
            "metrics": ",".join(metrics),
            "utc_offset": utc_offset,
            "ad_spend_mode": ad_spend_mode,
            "attribution_source": attribution_source,
            "attribution_type": attribution_type,
            "cohort_maturity": cohort_maturity,
            "reattributed": reattributed,
            "sandbox": sandbox,
        }

        # raise_for_status=False: the status code is checked explicitly below.
        request_client = Client(
            request_timeout=8.0,
            raise_for_status=False,
            retry_condition=self._retry_on_limit,
            request_max_attempts=12,
            request_backoff_factor=2,
        ).session

        response = request_client.get(self.uri, headers=headers, params=params)
        if response.status_code != 200:
            raise HTTPError(f"Request failed with status code: {response.status_code}")

        yield response.json().get("rows", [])
@@ -0,0 +1,24 @@
1
+ from typing import Iterable
2
+
3
+ import dlt
4
+ from dlt.common.typing import TDataItem
5
+ from dlt.sources import DltResource
6
+
7
+ from ingestr.src.appsflyer.client import AppsflyerClient
8
+
9
+
10
@dlt.source(max_table_nesting=0)
def appsflyer_source(
    api_key: str, start_date: str, end_date: str
) -> Iterable[DltResource]:
    """Build the dlt resources that load AppsFlyer aggregate data.

    Args:
        api_key: Key used to construct the shared `AppsflyerClient`.
        start_date: Start of the period; forwarded to the client fetchers.
        end_date: End of the period; forwarded to the client fetchers.

    Returns:
        The `campaigns` and `creatives` resources, in that order. Both use
        the "merge" write disposition with "install_time" as the merge key.
    """
    # One client instance is shared by both resources.
    appsflyer = AppsflyerClient(api_key)

    @dlt.resource(write_disposition="merge", merge_key="install_time")
    def campaigns() -> Iterable[TDataItem]:
        campaign_data = appsflyer.fetch_campaigns(start_date, end_date)
        yield from campaign_data

    @dlt.resource(write_disposition="merge", merge_key="install_time")
    def creatives() -> Iterable[TDataItem]:
        creative_data = appsflyer.fetch_creatives(start_date, end_date)
        yield from creative_data

    return campaigns, creatives
@@ -0,0 +1,106 @@
1
+ from typing import Optional
2
+
3
+ import requests
4
+ from dlt.sources.helpers.requests import Client
5
+ from requests.exceptions import HTTPError
6
+
7
# Groupings requested from the endpoint unless the caller overrides them.
DEFAULT_GROUPING = ["c", "geo", "app_id", "install_time"]
# KPIs requested from the endpoint unless the caller overrides them.
DEFAULT_KPIS = [
    "impressions",
    "clicks",
    "installs",
    "cost",
    "revenue",
    "average_ecpi",
    "loyal_users",
    "uninstalls",
    "roi",
]


class AppsflyerClient:
    """Small client for the AppsFlyer master aggregate data endpoint."""

    def __init__(self, api_key: str):
        """Store the API key and the endpoint URI."""
        self.api_key = api_key
        self.uri = "https://hq1.appsflyer.com/api/master-agg-data/v4/app/all"

    def __get_headers(self):
        """Return the request headers carrying the API key."""
        return {
            "Authorization": f"{self.api_key}",
            "accept": "text/json",
        }

    def _fetch_data(
        self,
        from_date: str,
        to_date: str,
        maximum_rows=1000000,
        dimensions=DEFAULT_GROUPING,
        metrics=DEFAULT_KPIS,
    ):
        """Request one aggregate report and yield the decoded JSON body.

        Args:
            from_date: Sent as the `from` query parameter.
            to_date: Sent as the `to` query parameter.
            maximum_rows: Sent as the `maximum_rows` query parameter.
            dimensions: Grouping names, sent comma-separated as `groupings`.
            metrics: KPI names, sent comma-separated as `kpis`.

        Yields:
            The decoded JSON response, as a single item.

        Raises:
            HTTPError: If the request fails, the body cannot be decoded, or
                the final response status code is not 200.
        """
        params = {
            "from": from_date,
            "to": to_date,
            "groupings": ",".join(dimensions),
            "kpis": ",".join(metrics),
            "format": "json",
            "maximum_rows": maximum_rows,
        }

        def retry_on_limit(
            response: Optional[requests.Response], exception: Optional[BaseException]
        ) -> bool:
            # response is None when the attempt ended in an exception.
            return (
                isinstance(response, requests.Response) and response.status_code == 429
            )

        # raise_for_status=False: the status code is checked explicitly below.
        request_client = Client(
            request_timeout=10.0,
            raise_for_status=False,
            retry_condition=retry_on_limit,
            request_max_attempts=12,
            request_backoff_factor=2,
        ).session

        try:
            response = request_client.get(
                url=self.uri, headers=self.__get_headers(), params=params
            )
            if response.status_code != 200:
                raise HTTPError(
                    f"Request failed with status code: {response.status_code}"
                )
            result = response.json()
        except HTTPError:
            # HTTPError is itself a RequestException; let it through untouched
            # instead of wrapping its message a second time.
            raise
        except requests.RequestException as e:
            # Normalise every other request failure to HTTPError while
            # keeping the original exception as the cause.
            raise HTTPError(f"Request failed: {e}") from e

        # Yield outside the try so errors raised by the consumer are not
        # mistaken for request failures.
        yield result

    def fetch_campaigns(
        self,
        start_date: str,
        end_date: str,
    ):
        """Fetch data with the default groupings and extended cohort/retention KPIs."""
        metrics = DEFAULT_KPIS + [
            "cohort_day_1_revenue_per_user",
            "cohort_day_1_total_revenue_per_user",
            "cohort_day_3_revenue_per_user",
            "cohort_day_3_total_revenue_per_user",
            "cohort_day_7_total_revenue_per_user",
            "cohort_day_7_revenue_per_user",
            "cohort_day_14_total_revenue_per_user",
            "cohort_day_14_revenue_per_user",
            "cohort_day_21_total_revenue_per_user",
            "cohort_day_21_revenue_per_user",
            "retention_day_7",
        ]
        return self._fetch_data(start_date, end_date, metrics=metrics)

    def fetch_creatives(
        self,
        start_date: str,
        end_date: str,
    ):
        """Fetch data with the default KPIs, additionally grouped by ad set and ad."""
        dimensions = DEFAULT_GROUPING + ["af_adset_id", "af_adset", "af_ad_id"]
        return self._fetch_data(start_date, end_date, dimensions=dimensions)
@@ -0,0 +1,197 @@
1
+ """Loads campaigns, ads sets, ads, leads and insight data from Facebook Marketing API"""
2
+
3
+ from typing import Iterator, Sequence
4
+
5
+ import dlt
6
+ from dlt.common import pendulum
7
+ from dlt.common.typing import TDataItems
8
+ from dlt.sources import DltResource
9
+ from facebook_business.adobjects.ad import Ad
10
+
11
+ from .helpers import (
12
+ execute_job,
13
+ get_ads_account,
14
+ get_data_chunked,
15
+ get_start_date,
16
+ process_report_item,
17
+ )
18
+ from .settings import (
19
+ ALL_ACTION_ATTRIBUTION_WINDOWS,
20
+ ALL_ACTION_BREAKDOWNS,
21
+ DEFAULT_AD_FIELDS,
22
+ DEFAULT_ADCREATIVE_FIELDS,
23
+ DEFAULT_ADSET_FIELDS,
24
+ DEFAULT_CAMPAIGN_FIELDS,
25
+ DEFAULT_INSIGHT_FIELDS,
26
+ DEFAULT_LEAD_FIELDS,
27
+ INSIGHT_FIELDS_TYPES,
28
+ INSIGHTS_BREAKDOWNS_OPTIONS,
29
+ INSIGHTS_PRIMARY_KEY,
30
+ INVALID_INSIGHTS_FIELDS,
31
+ TInsightsBreakdownOptions,
32
+ TInsightsLevels,
33
+ )
34
+
35
+
36
@dlt.source(name="facebook_ads", max_table_nesting=0)
def facebook_ads_source(
    account_id: str = dlt.config.value,
    access_token: str = dlt.secrets.value,
    chunk_size: int = 50,
    request_timeout: float = 300.0,
    app_api_version: str = "v20.0",
) -> Sequence[DltResource]:
    """Build resources for campaigns, ads, ad sets, ad creatives and ad leads.

    Every resource uses "id" as its primary key and the `replace` write
    disposition. Each resource accepts `fields` (which fields to request)
    and `states` (forwarded to `get_data_chunked`), so callers can
    parametrize what is loaded.

    Args:
        account_id (str, optional): Ads account id, taken from dlt config
            when not passed explicitly.
        access_token (str, optional): Access token, taken from dlt secrets
            when not passed explicitly.
        chunk_size (int, optional): Forwarded to `get_data_chunked` for
            every resource. Defaults to 50.
        request_timeout (float, optional): Forwarded to `get_ads_account`.
            Defaults to 300.0.
        app_api_version (str, optional): Facebook API version forwarded to
            `get_ads_account`. Defaults to "v20.0".

    Returns:
        Sequence[DltResource]: campaigns, ads, ad_sets, ad_creatives and
        the `ads | leads` pipe, in that order.
    """
    ads_account = get_ads_account(
        account_id, access_token, request_timeout, app_api_version
    )

    @dlt.resource(primary_key="id", write_disposition="replace")
    def campaigns(
        fields: Sequence[str] = DEFAULT_CAMPAIGN_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        fetch_campaigns = ads_account.get_campaigns
        yield get_data_chunked(fetch_campaigns, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ads(
        fields: Sequence[str] = DEFAULT_AD_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        fetch_ads = ads_account.get_ads
        yield get_data_chunked(fetch_ads, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ad_sets(
        fields: Sequence[str] = DEFAULT_ADSET_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        fetch_ad_sets = ads_account.get_ad_sets
        yield get_data_chunked(fetch_ad_sets, fields, states, chunk_size)

    @dlt.transformer(primary_key="id", write_disposition="replace", selected=True)
    def leads(
        items: TDataItems,
        fields: Sequence[str] = DEFAULT_LEAD_FIELDS,
        states: Sequence[str] = None,
    ) -> Iterator[TDataItems]:
        # One lead lookup per incoming ad item, keyed by the item's "id".
        for ad_item in items:
            fetch_leads = Ad(ad_item["id"]).get_leads
            yield get_data_chunked(fetch_leads, fields, states, chunk_size)

    @dlt.resource(primary_key="id", write_disposition="replace")
    def ad_creatives(
        fields: Sequence[str] = DEFAULT_ADCREATIVE_FIELDS, states: Sequence[str] = None
    ) -> Iterator[TDataItems]:
        fetch_ad_creatives = ads_account.get_ad_creatives
        yield get_data_chunked(fetch_ad_creatives, fields, states, chunk_size)

    # leads is a transformer, so it is fed by piping the ads resource into it.
    return campaigns, ads, ad_sets, ad_creatives, ads | leads
102
+
103
+
104
@dlt.source(name="facebook_ads", max_table_nesting=0)
def facebook_insights_source(
    account_id: str = dlt.config.value,
    access_token: str = dlt.secrets.value,
    initial_load_past_days: int = 1,
    fields: Sequence[str] = DEFAULT_INSIGHT_FIELDS,
    attribution_window_days_lag: int = 7,
    time_increment_days: int = 1,
    breakdowns: TInsightsBreakdownOptions = "ads_insights",
    action_breakdowns: Sequence[str] = ALL_ACTION_BREAKDOWNS,
    level: TInsightsLevels = "ad",
    action_attribution_windows: Sequence[str] = ALL_ACTION_ATTRIBUTION_WINDOWS,
    batch_size: int = 50,
    request_timeout: int = 300,
    app_api_version: str = None,
) -> DltResource:
    """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.

    Reports are generated one window at a time (`time_increment_days` days
    each), from the computed start date up to now. The incremental cursor is
    the `date_start` field; on the first run it starts
    `initial_load_past_days` days before today. The cursor and
    `attribution_window_days_lag` are passed to `get_start_date` to pick the
    actual start (presumably moving it back so reports inside the attribution
    window are refreshed on each run -- confirm against the helper).

    Mind that each report is a job and takes some time to execute.

    Args:
        account_id (str, optional): Ads account id, taken from dlt config when not passed.
        access_token (str, optional): Access token, taken from dlt secrets when not passed.
        initial_load_past_days (int, optional): How many past days (starting from today) to initially load. Defaults to 1.
        fields (Sequence[str], optional): Fields to include in each report. The fields of the selected `breakdowns` preset are added automatically. Defaults to DEFAULT_INSIGHT_FIELDS.
        attribution_window_days_lag (int, optional): Attribution window in days. Defaults to 7.
        time_increment_days (int, optional): The report aggregation window in days; must be at least 1. Use 7 for weekly aggregation. Defaults to 1.
        breakdowns (TInsightsBreakdownOptions, optional): Key of a preset in INSIGHTS_BREAKDOWNS_OPTIONS. Defaults to "ads_insights".
        action_breakdowns (Sequence[str], optional): Action aggregation types. Defaults to ALL_ACTION_BREAKDOWNS.
        level (TInsightsLevels, optional): The granularity level. Defaults to "ad".
        action_attribution_windows (Sequence[str], optional): Attribution windows for actions. Defaults to ALL_ACTION_ATTRIBUTION_WINDOWS.
        batch_size (int, optional): Sent as the `limit` of each report query. Defaults to 50.
        request_timeout (int, optional): Forwarded to `get_ads_account`. Defaults to 300.
        app_api_version (str, optional): Facebook API version forwarded to `get_ads_account`. Defaults to None.

    Returns:
        DltResource: facebook_insights

    Raises:
        ValueError: If `time_increment_days` is smaller than 1.
    """
    # A non-positive increment would never advance the report window and the
    # loop below would submit report jobs forever.
    if time_increment_days < 1:
        raise ValueError(
            f"time_increment_days must be at least 1, got {time_increment_days}"
        )

    account = get_ads_account(
        account_id, access_token, request_timeout, app_api_version
    )

    # the first run starts `initial_load_past_days` days before today
    initial_load_start_date = pendulum.today().subtract(days=initial_load_past_days)
    initial_load_start_date_str = initial_load_start_date.isoformat()

    @dlt.resource(
        primary_key=INSIGHTS_PRIMARY_KEY,
        write_disposition="merge",
        columns=INSIGHT_FIELDS_TYPES,
    )
    def facebook_insights(
        date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
            "date_start", initial_value=initial_load_start_date_str
        ),
    ) -> Iterator[TDataItems]:
        start_date = get_start_date(date_start, attribution_window_days_lag)
        end_date = pendulum.now()

        # These do not depend on the report window, so resolve them once.
        breakdown_options = INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]
        report_fields = list(
            set(fields)
            .union(breakdown_options["fields"])
            .difference(INVALID_INSIGHTS_FIELDS)
        )

        # fetch insights in incremental day steps
        while start_date <= end_date:
            # Fresh list copies per query so one request cannot affect the next.
            query = {
                "level": level,
                "action_breakdowns": list(action_breakdowns),
                "breakdowns": list(breakdown_options["breakdowns"]),
                "limit": batch_size,
                "fields": list(report_fields),
                "time_increment": time_increment_days,
                "action_attribution_windows": list(action_attribution_windows),
                "time_ranges": [
                    {
                        "since": start_date.to_date_string(),
                        # the range is inclusive, hence the -1
                        "until": start_date.add(
                            days=time_increment_days - 1
                        ).to_date_string(),
                    }
                ],
            }
            job = execute_job(account.get_insights(params=query, is_async=True))
            yield list(map(process_report_item, job.get_result()))
            start_date = start_date.add(days=time_increment_days)

    return facebook_insights
@@ -0,0 +1,5 @@
1
+ from dlt.extract.exceptions import DltResourceException
2
+
3
+
4
class InsightsJobTimeout(DltResourceException):
    """Resource exception for a timed-out insights job.

    Adds no behavior of its own; the description is based on the class name,
    as the raise sites are outside this module.
    """