PyPI - ingestr - Versions diffs - 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend

ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

ingestr/main.py +10 -0
ingestr/src/.gitignore +10 -0
ingestr/src/adjust/_init_.py +31 -0
ingestr/src/adjust/helpers.py +82 -0
ingestr/src/appsflyer/_init_.py +24 -0
ingestr/src/appsflyer/client.py +106 -0
ingestr/src/facebook_ads/__init__.py +197 -0
ingestr/src/facebook_ads/exceptions.py +5 -0
ingestr/src/facebook_ads/helpers.py +255 -0
ingestr/src/facebook_ads/settings.py +208 -0
ingestr/src/factory.py +15 -0
ingestr/src/kafka/__init__.py +103 -0
ingestr/src/kafka/helpers.py +227 -0
ingestr/src/klaviyo/_init_.py +173 -0
ingestr/src/klaviyo/client.py +212 -0
ingestr/src/klaviyo/helpers.py +19 -0
ingestr/src/shopify/__init__.py +1752 -54
ingestr/src/shopify/helpers.py +73 -32
ingestr/src/sources.py +230 -7
ingestr/src/version.py +1 -1
{ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/METADATA +22 -1
{ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/RECORD +25 -11
{ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/WHEEL +0 -0
{ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/entry_points.txt +0 -0
{ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/licenses/LICENSE.md +0 -0

ingestr/main.py CHANGED Viewed

@@ -244,6 +244,13 @@ def ingest(
             envvar="PIPELINES_DIR",
         ),
     ] = None,  # type: ignore
+    extract_parallelism: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
+            envvar="EXTRACT_PARALLELISM",
+        ),
+    ] = 5,  # type: ignore
 ):
     track(
         "command_triggered",
@@ -252,7 +259,10 @@ def ingest(
         },
     )
+    dlt.config["data_writer.buffer_max_items"] = page_size
     dlt.config["data_writer.file_max_items"] = loader_file_size
+    dlt.config["extract.workers"] = extract_parallelism
+    dlt.config["extract.max_parallel_items"] = extract_parallelism
     if schema_naming != SchemaNaming.default:
         dlt.config["schema.naming"] = schema_naming.value

ingestr/src/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+# ignore secrets, virtual environments and typical python compilation artifacts
+secrets.toml
+# ignore basic python artifacts
+.env
+**/__pycache__/
+**/*.py[cod]
+**/*$py.class
+# ignore duckdb
+*.duckdb
+*.wal

ingestr/src/adjust/_init_.py ADDED Viewed

@@ -0,0 +1,31 @@
+from typing import Sequence
+import dlt
+from dlt.sources import DltResource
+from .helpers import DEFAULT_DIMENSIONS, AdjustAPI
+@dlt.source(max_table_nesting=0)
+def adjust_source(
+    start_date: str,
+    end_date: str,
+    api_key: str,
+) -> Sequence[DltResource]:
+    @dlt.resource(write_disposition="merge", merge_key="day")
+    def campaigns():
+        adjust_api = AdjustAPI(api_key=api_key)
+        yield from adjust_api.fetch_report_data(
+            start_date=start_date,
+            end_date=end_date,
+        )
+    @dlt.resource(write_disposition="merge", merge_key="day")
+    def creatives():
+        dimensions = DEFAULT_DIMENSIONS + ["adgroup", "creative"]
+        adjust_api = AdjustAPI(api_key=api_key)
+        yield from adjust_api.fetch_report_data(
+            start_date=start_date, end_date=end_date, dimensions=dimensions
+        )
+    return campaigns, creatives

ingestr/src/adjust/helpers.py ADDED Viewed

@@ -0,0 +1,82 @@
+import requests
+from dlt.sources.helpers.requests import Client
+from requests.exceptions import HTTPError
+DEFAULT_DIMENSIONS = ["campaign", "day", "app", "store_type", "channel", "country"]
+DEFAULT_METRICS = [
+    "network_cost",
+    "all_revenue_total_d0",
+    "ad_revenue_total_d0",
+    "revenue_total_d0",
+    "all_revenue_total_d1",
+    "ad_revenue_total_d1",
+    "revenue_total_d1",
+    "all_revenue_total_d3",
+    "ad_revenue_total_d3",
+    "revenue_total_d3",
+    "all_revenue_total_d7",
+    "ad_revenue_total_d7",
+    "revenue_total_d7",
+    "all_revenue_total_d14",
+    "ad_revenue_total_d14",
+    "revenue_total_d14",
+    "all_revenue_total_d21",
+]
+class AdjustAPI:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.uri = "https://automate.adjust.com/reports-service/report"
+    def fetch_report_data(
+        self,
+        start_date,
+        end_date,
+        dimensions=DEFAULT_DIMENSIONS,
+        metrics=DEFAULT_METRICS,
+        utc_offset="+00:00",
+        ad_spend_mode="network",
+        attribution_source="first",
+        attribution_type="all",
+        cohort_maturity="immature",
+        reattributed="all",
+        sandbox="false",
+    ):
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        comma_separated_dimensions = ",".join(dimensions)
+        comma_separated_metrics = ",".join(metrics)
+        params = {
+            "date_period": f"{start_date}:{end_date}",
+            "dimensions": comma_separated_dimensions,
+            "metrics": comma_separated_metrics,
+            "utc_offset": utc_offset,
+            "ad_spend_mode": ad_spend_mode,
+            "attribution_source": attribution_source,
+            "attribution_type": attribution_type,
+            "cohort_maturity": cohort_maturity,
+            "reattributed": reattributed,
+            "sandbox": sandbox,
+        }
+        def retry_on_limit(
+            response: requests.Response, exception: BaseException
+        ) -> bool:
+            return response.status_code == 429
+        request_client = Client(
+            request_timeout=8.0,
+            raise_for_status=False,
+            retry_condition=retry_on_limit,
+            request_max_attempts=12,
+            request_backoff_factor=2,
+        ).session
+        response = request_client.get(self.uri, headers=headers, params=params)
+        if response.status_code == 200:
+            result = response.json()
+            items = result.get("rows", [])
+            yield items
+        else:
+            raise HTTPError(f"Request failed with status code: {response.status_code}")

ingestr/src/appsflyer/_init_.py ADDED Viewed

@@ -0,0 +1,24 @@
+from typing import Iterable
+import dlt
+from dlt.common.typing import TDataItem
+from dlt.sources import DltResource
+from ingestr.src.appsflyer.client import AppsflyerClient
+@dlt.source(max_table_nesting=0)
+def appsflyer_source(
+    api_key: str, start_date: str, end_date: str
+) -> Iterable[DltResource]:
+    client = AppsflyerClient(api_key)
+    @dlt.resource(write_disposition="merge", merge_key="install_time")
+    def campaigns() -> Iterable[TDataItem]:
+        yield from client.fetch_campaigns(start_date, end_date)
+    @dlt.resource(write_disposition="merge", merge_key="install_time")
+    def creatives() -> Iterable[TDataItem]:
+        yield from client.fetch_creatives(start_date, end_date)
+    return campaigns, creatives

ingestr/src/appsflyer/client.py ADDED Viewed

@@ -0,0 +1,106 @@
+from typing import Optional
+import requests
+from dlt.sources.helpers.requests import Client
+from requests.exceptions import HTTPError
+DEFAULT_GROUPING = ["c", "geo", "app_id", "install_time"]
+DEFAULT_KPIS = [
+    "impressions",
+    "clicks",
+    "installs",
+    "cost",
+    "revenue",
+    "average_ecpi",
+    "loyal_users",
+    "uninstalls",
+    "roi",
+]
+class AppsflyerClient:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.uri = "https://hq1.appsflyer.com/api/master-agg-data/v4/app/all"
+    def __get_headers(self):
+        return {
+            "Authorization": f"{self.api_key}",
+            "accept": "text/json",
+        }
+    def _fetch_data(
+        self,
+        from_date: str,
+        to_date: str,
+        maximum_rows=1000000,
+        dimensions=DEFAULT_GROUPING,
+        metrics=DEFAULT_KPIS,
+    ):
+        params = {
+            "from": from_date,
+            "to": to_date,
+            "groupings": ",".join(dimensions),
+            "kpis": ",".join(metrics),
+            "format": "json",
+            "maximum_rows": maximum_rows,
+        }
+        def retry_on_limit(
+            response: Optional[requests.Response], exception: Optional[BaseException]
+        ) -> bool:
+            return (
+                isinstance(response, requests.Response) and response.status_code == 429
+            )
+        request_client = Client(
+            request_timeout=10.0,
+            raise_for_status=False,
+            retry_condition=retry_on_limit,
+            request_max_attempts=12,
+            request_backoff_factor=2,
+        ).session
+        try:
+            response = request_client.get(
+                url=self.uri, headers=self.__get_headers(), params=params
+            )
+            if response.status_code == 200:
+                result = response.json()
+                yield result
+            else:
+                raise HTTPError(
+                    f"Request failed with status code: {response.status_code}"
+                )
+        except requests.RequestException as e:
+            raise HTTPError(f"Request failed: {e}")
+    def fetch_campaigns(
+        self,
+        start_date: str,
+        end_date: str,
+    ):
+        metrics = DEFAULT_KPIS + [
+            "cohort_day_1_revenue_per_user",
+            "cohort_day_1_total_revenue_per_user",
+            "cohort_day_3_revenue_per_user",
+            "cohort_day_3_total_revenue_per_user",
+            "cohort_day_7_total_revenue_per_user",
+            "cohort_day_7_revenue_per_user",
+            "cohort_day_14_total_revenue_per_user",
+            "cohort_day_14_revenue_per_user",
+            "cohort_day_21_total_revenue_per_user",
+            "cohort_day_21_revenue_per_user",
+            "retention_day_7",
+        ]
+        return self._fetch_data(start_date, end_date, metrics=metrics)
+    def fetch_creatives(
+        self,
+        start_date: str,
+        end_date: str,
+    ):
+        dimensions = DEFAULT_GROUPING + ["af_adset_id", "af_adset", "af_ad_id"]
+        return self._fetch_data(start_date, end_date, dimensions=dimensions)

ingestr/src/facebook_ads/__init__.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Loads campaigns, ads sets, ads, leads and insight data from Facebook Marketing API"""
+from typing import Iterator, Sequence
+import dlt
+from dlt.common import pendulum
+from dlt.common.typing import TDataItems
+from dlt.sources import DltResource
+from facebook_business.adobjects.ad import Ad
+from .helpers import (
+    execute_job,
+    get_ads_account,
+    get_data_chunked,
+    get_start_date,
+    process_report_item,
+)
+from .settings import (
+    ALL_ACTION_ATTRIBUTION_WINDOWS,
+    ALL_ACTION_BREAKDOWNS,
+    DEFAULT_AD_FIELDS,
+    DEFAULT_ADCREATIVE_FIELDS,
+    DEFAULT_ADSET_FIELDS,
+    DEFAULT_CAMPAIGN_FIELDS,
+    DEFAULT_INSIGHT_FIELDS,
+    DEFAULT_LEAD_FIELDS,
+    INSIGHT_FIELDS_TYPES,
+    INSIGHTS_BREAKDOWNS_OPTIONS,
+    INSIGHTS_PRIMARY_KEY,
+    INVALID_INSIGHTS_FIELDS,
+    TInsightsBreakdownOptions,
+    TInsightsLevels,
+)
+@dlt.source(name="facebook_ads", max_table_nesting=0)
+def facebook_ads_source(
+    account_id: str = dlt.config.value,
+    access_token: str = dlt.secrets.value,
+    chunk_size: int = 50,
+    request_timeout: float = 300.0,
+    app_api_version: str = "v20.0",
+) -> Sequence[DltResource]:
+    """Returns a list of resources to load campaigns, ad sets, ads, creatives and ad leads data from Facebook Marketing API.
+    All the resources have `replace` write disposition by default and define primary keys. Resources are parametrized and allow the user
+    to change the set of fields that will be loaded from the API and the object statuses that will be loaded. See the demonstration script for details.
+    You can convert the source into merge resource to keep the deleted objects. Currently Marketing API does not return deleted objects. See the demo script.
+    We also provide a transformation `enrich_ad_objects` that you can add to any of the resources to get additional data per object via `object.get_api`
+    Args:
+        account_id (str, optional): Account id associated with add manager. See README.md
+        access_token (str, optional): Access token associated with the Business Facebook App. See README.md
+        chunk_size (int, optional): A size of the page and batch request. You may need to decrease it if you request a lot of fields. Defaults to 50.
+        request_timeout (float, optional): Connection timeout. Defaults to 300.0.
+        app_api_version(str, optional): A version of the facebook api required by the app for which the access tokens were issued ie. 'v17.0'. Defaults to the facebook_business library default version
+    Returns:
+        Sequence[DltResource]: campaigns, ads, ad_sets, ad_creatives, leads
+    """
+    account = get_ads_account(
+        account_id, access_token, request_timeout, app_api_version
+    )
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def campaigns(
+        fields: Sequence[str] = DEFAULT_CAMPAIGN_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_campaigns, fields, states, chunk_size)
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ads(
+        fields: Sequence[str] = DEFAULT_AD_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ads, fields, states, chunk_size)
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ad_sets(
+        fields: Sequence[str] = DEFAULT_ADSET_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ad_sets, fields, states, chunk_size)
+    @dlt.transformer(primary_key="id", write_disposition="replace", selected=True)
+    def leads(
+        items: TDataItems,
+        fields: Sequence[str] = DEFAULT_LEAD_FIELDS,
+        states: Sequence[str] = None,
+    ) -> Iterator[TDataItems]:
+        for item in items:
+            ad = Ad(item["id"])
+            yield get_data_chunked(ad.get_leads, fields, states, chunk_size)
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ad_creatives(
+        fields: Sequence[str] = DEFAULT_ADCREATIVE_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ad_creatives, fields, states, chunk_size)
+    return campaigns, ads, ad_sets, ad_creatives, ads | leads
+@dlt.source(name="facebook_ads", max_table_nesting=0)
+def facebook_insights_source(
+    account_id: str = dlt.config.value,
+    access_token: str = dlt.secrets.value,
+    initial_load_past_days: int = 1,
+    fields: Sequence[str] = DEFAULT_INSIGHT_FIELDS,
+    attribution_window_days_lag: int = 7,
+    time_increment_days: int = 1,
+    breakdowns: TInsightsBreakdownOptions = "ads_insights",
+    action_breakdowns: Sequence[str] = ALL_ACTION_BREAKDOWNS,
+    level: TInsightsLevels = "ad",
+    action_attribution_windows: Sequence[str] = ALL_ACTION_ATTRIBUTION_WINDOWS,
+    batch_size: int = 50,
+    request_timeout: int = 300,
+    app_api_version: str = None,
+) -> DltResource:
+    """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.
+    By default, the reports are generated one by one for each day, starting with today - attribution_window_days_lag. On subsequent runs, only the reports
+    from the last report date until today are loaded (incremental load). The reports from last 7 days (`attribution_window_days_lag`) are refreshed on each load to
+    account for changes during attribution window.
+    Mind that each report is a job and takes some time to execute.
+    Args:
+        account_id: str = dlt.config.value,
+        access_token: str = dlt.secrets.value,
+        initial_load_past_days (int, optional): How many past days (starting from today) to intially load. Defaults to 30.
+        fields (Sequence[str], optional): A list of fields to include in each reports. Note that `breakdowns` option adds fields automatically. Defaults to DEFAULT_INSIGHT_FIELDS.
+        attribution_window_days_lag (int, optional): Attribution window in days. The reports in attribution window are refreshed on each run.. Defaults to 7.
+        time_increment_days (int, optional): The report aggregation window in days. use 7 for weekly aggregation. Defaults to 1.
+        breakdowns (TInsightsBreakdownOptions, optional): A presents with common aggregations. See settings.py for details. Defaults to "ads_insights_age_and_gender".
+        action_breakdowns (Sequence[str], optional): Action aggregation types. See settings.py for details. Defaults to ALL_ACTION_BREAKDOWNS.
+        level (TInsightsLevels, optional): The granularity level. Defaults to "ad".
+        action_attribution_windows (Sequence[str], optional): Attribution windows for actions. Defaults to ALL_ACTION_ATTRIBUTION_WINDOWS.
+        batch_size (int, optional): Page size when reading data from particular report. Defaults to 50.
+        request_timeout (int, optional): Connection timeout. Defaults to 300.
+        app_api_version(str, optional): A version of the facebook api required by the app for which the access tokens were issued ie. 'v17.0'. Defaults to the facebook_business library default version
+    Returns:
+        DltResource: facebook_insights
+    """
+    account = get_ads_account(
+        account_id, access_token, request_timeout, app_api_version
+    )
+    # we load with a defined lag
+    initial_load_start_date = pendulum.today().subtract(days=initial_load_past_days)
+    initial_load_start_date_str = initial_load_start_date.isoformat()
+    @dlt.resource(
+        primary_key=INSIGHTS_PRIMARY_KEY,
+        write_disposition="merge",
+        columns=INSIGHT_FIELDS_TYPES,
+    )
+    def facebook_insights(
+        date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "date_start", initial_value=initial_load_start_date_str
+        ),
+    ) -> Iterator[TDataItems]:
+        start_date = get_start_date(date_start, attribution_window_days_lag)
+        end_date = pendulum.now()
+        # fetch insights in incremental day steps
+        while start_date <= end_date:
+            query = {
+                "level": level,
+                "action_breakdowns": list(action_breakdowns),
+                "breakdowns": list(
+                    INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["breakdowns"]
+                ),
+                "limit": batch_size,
+                "fields": list(
+                    set(fields)
+                    .union(INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["fields"])
+                    .difference(INVALID_INSIGHTS_FIELDS)
+                ),
+                "time_increment": time_increment_days,
+                "action_attribution_windows": list(action_attribution_windows),
+                "time_ranges": [
+                    {
+                        "since": start_date.to_date_string(),
+                        "until": start_date.add(
+                            days=time_increment_days - 1
+                        ).to_date_string(),
+                    }
+                ],
+            }
+            job = execute_job(account.get_insights(params=query, is_async=True))
+            yield list(map(process_report_item, job.get_result()))
+            start_date = start_date.add(days=time_increment_days)
+    return facebook_insights

ingestr/src/facebook_ads/exceptions.py ADDED Viewed

@@ -0,0 +1,5 @@
+from dlt.extract.exceptions import DltResourceException
+class InsightsJobTimeout(DltResourceException):
+    pass

ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl

ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl