ingestr 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +11 -1
- ingestr/src/.gitignore +10 -0
- ingestr/src/airtable/__init__.py +69 -0
- ingestr/src/facebook_ads/__init__.py +197 -0
- ingestr/src/facebook_ads/exceptions.py +5 -0
- ingestr/src/facebook_ads/helpers.py +255 -0
- ingestr/src/facebook_ads/settings.py +208 -0
- ingestr/src/factory.py +15 -0
- ingestr/src/kafka/__init__.py +103 -0
- ingestr/src/kafka/helpers.py +227 -0
- ingestr/src/klaviyo/_init_.py +173 -0
- ingestr/src/klaviyo/client.py +212 -0
- ingestr/src/klaviyo/helpers.py +19 -0
- ingestr/src/slack/__init__.py +272 -0
- ingestr/src/slack/helpers.py +204 -0
- ingestr/src/slack/settings.py +22 -0
- ingestr/src/sources.py +222 -1
- ingestr/src/version.py +1 -1
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/METADATA +31 -5
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/RECORD +23 -9
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/WHEEL +0 -0
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/entry_points.txt +0 -0
- {ingestr-0.7.6.dist-info → ingestr-0.7.8.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -1,7 +1,7 @@
 import hashlib
+import tempfile
 from datetime import datetime
 from enum import Enum
-import tempfile
 from typing import Optional
 
 import dlt
@@ -244,6 +244,13 @@ def ingest(
             envvar="PIPELINES_DIR",
         ),
     ] = None,  # type: ignore
+    extract_parallelism: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
+            envvar="EXTRACT_PARALLELISM",
+        ),
+    ] = 5,  # type: ignore
 ):
     track(
         "command_triggered",
@@ -253,6 +260,8 @@ def ingest(
     )
 
     dlt.config["data_writer.file_max_items"] = loader_file_size
+    dlt.config["extract.workers"] = extract_parallelism
+    dlt.config["extract.max_parallel_items"] = extract_parallelism
     if schema_naming != SchemaNaming.default:
         dlt.config["schema.naming"] = schema_naming.value
 
@@ -413,6 +422,7 @@ def ingest(
     # remove the pipelines_dir folder if it was created by ingestr
     if is_pipelines_dir_temp:
         import shutil
+
         shutil.rmtree(pipelines_dir)
 
     print(
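Note on the new option: typer derives CLI flag names from parameter names, so `extract_parallelism` should surface as `--extract-parallelism` (with `EXTRACT_PARALLELISM` as the environment-variable alternative). A hypothetical invocation — the URIs and table names below are placeholders, not taken from this diff:

ingestr ingest --source-uri 'postgresql://user:pass@host:5432/db' --source-table 'public.events' --dest-uri 'duckdb:///ingestr.duckdb' --dest-table 'raw.events' --extract-parallelism 10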
ingestr/src/airtable/__init__.py
ADDED
@@ -0,0 +1,69 @@
+"""Source that loads tables from Airtable.
+Supports whitelisting of tables or loading of all tables from a specified base.
+"""
+
+from typing import Any, Dict, Iterable, Iterator, List, Optional
+
+import dlt
+import pyairtable
+from dlt.sources import DltResource
+
+
+@dlt.source
+def airtable_source(
+    base_id: str = dlt.config.value,
+    table_names: Optional[List[str]] = dlt.config.value,
+    access_token: str = dlt.secrets.value,
+) -> Iterable[DltResource]:
+    """
+    Represents tables for a single Airtable base.
+    Args:
+        base_id (str): The id of the base. Obtain it, e.g., from the URL in your web browser.
+            It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
+        table_names (Optional[List[str]]): A list of table IDs or table names to load.
+            Unless specified otherwise, all tables in the schema are loaded.
+            Names are freely user-defined. IDs start with "tbl". See https://support.airtable.com/docs/finding-airtable-ids
+        access_token (str): The personal access token.
+            See https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens#personal-access-tokens-basic-actions
+    """
+    api = pyairtable.Api(access_token)
+    all_tables_url = api.build_url(f"meta/bases/{base_id}/tables")
+    tables = api.request(method="GET", url=all_tables_url).get("tables")
+    for t in tables:
+        if table_names:
+            if t.get("id") in table_names or t.get("name") in table_names:
+                yield airtable_resource(api, base_id, t)
+        else:
+            yield airtable_resource(api, base_id, t)
+
+
+def airtable_resource(
+    api: pyairtable.Api,
+    base_id: str,
+    table: Dict[str, Any],
+) -> DltResource:
+    """
+    Represents a single airtable.
+    Args:
+        api (pyairtable.Api): The API connection object
+        base_id (str): The id of the base. Obtain it, e.g., from the URL in your web browser.
+            It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
+        table (Dict[str, Any]): Metadata about an airtable; does not contain the actual records
+    """
+    primary_key_id = table["primaryFieldId"]
+    primary_key_field = [
+        field for field in table["fields"] if field["id"] == primary_key_id
+    ][0]
+    table_name: str = table["name"]
+    primary_key: List[str] = [primary_key_field["name"]]
+    air_table = api.table(base_id, table["id"])
+
+    # Table.iterate() supports rich customization options, such as chunk size, fields, cell format, timezone, locale, and view
+    air_table_generator: Iterator[List[Any]] = air_table.iterate()
+
+    return dlt.resource(
+        air_table_generator,
+        name=table_name,
+        primary_key=primary_key,
+        write_disposition="replace",
+    )
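Taken together: `airtable_source` lists tables through the Airtable Metadata API and yields one full-refresh resource per table. A minimal sketch of wiring it into a dlt pipeline — the duckdb destination and all IDs/tokens below are placeholders, not part of the diff:

import dlt

from ingestr.src.airtable import airtable_source

pipeline = dlt.pipeline(
    pipeline_name="airtable_demo",
    destination="duckdb",  # assumption: any configured dlt destination works here
    dataset_name="airtable_data",
)
info = pipeline.run(
    airtable_source(
        base_id="appXXXXXXXXXXXXXX",  # placeholder base id ("app...")
        table_names=["tblXXXXXXXXXXXXXX"],  # optional whitelist; None loads every table
        access_token="patXXXXXXXXXXXXXX",  # placeholder personal access token
    )
)
print(info)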
ingestr/src/facebook_ads/__init__.py
ADDED
@@ -0,0 +1,197 @@
+"""Loads campaigns, ad sets, ads, leads and insight data from Facebook Marketing API"""
+
+from typing import Iterator, Sequence
+
+import dlt
+from dlt.common import pendulum
+from dlt.common.typing import TDataItems
+from dlt.sources import DltResource
+from facebook_business.adobjects.ad import Ad
+
+from .helpers import (
+    execute_job,
+    get_ads_account,
+    get_data_chunked,
+    get_start_date,
+    process_report_item,
+)
+from .settings import (
+    ALL_ACTION_ATTRIBUTION_WINDOWS,
+    ALL_ACTION_BREAKDOWNS,
+    DEFAULT_AD_FIELDS,
+    DEFAULT_ADCREATIVE_FIELDS,
+    DEFAULT_ADSET_FIELDS,
+    DEFAULT_CAMPAIGN_FIELDS,
+    DEFAULT_INSIGHT_FIELDS,
+    DEFAULT_LEAD_FIELDS,
+    INSIGHT_FIELDS_TYPES,
+    INSIGHTS_BREAKDOWNS_OPTIONS,
+    INSIGHTS_PRIMARY_KEY,
+    INVALID_INSIGHTS_FIELDS,
+    TInsightsBreakdownOptions,
+    TInsightsLevels,
+)
+
+
+@dlt.source(name="facebook_ads", max_table_nesting=0)
+def facebook_ads_source(
+    account_id: str = dlt.config.value,
+    access_token: str = dlt.secrets.value,
+    chunk_size: int = 50,
+    request_timeout: float = 300.0,
+    app_api_version: str = "v20.0",
+) -> Sequence[DltResource]:
+    """Returns a list of resources to load campaigns, ad sets, ads, creatives and ad leads data from the Facebook Marketing API.
+
+    All the resources have `replace` write disposition by default and define primary keys. Resources are parametrized and allow the user
+    to change the set of fields that will be loaded from the API and the object statuses that will be loaded. See the demonstration script for details.
+
+    You can convert the source into a merge resource to keep the deleted objects. Currently the Marketing API does not return deleted objects. See the demo script.
+
+    We also provide a transformation `enrich_ad_objects` that you can add to any of the resources to get additional data per object via `object.get_api`.
+
+    Args:
+        account_id (str, optional): Account id associated with the ad manager. See README.md
+        access_token (str, optional): Access token associated with the Business Facebook App. See README.md
+        chunk_size (int, optional): The size of the page and batch request. You may need to decrease it if you request a lot of fields. Defaults to 50.
+        request_timeout (float, optional): Connection timeout. Defaults to 300.0.
+        app_api_version (str, optional): The version of the Facebook API required by the app for which the access tokens were issued, e.g. 'v17.0'. Defaults to the facebook_business library default version.
+
+    Returns:
+        Sequence[DltResource]: campaigns, ads, ad_sets, ad_creatives, leads
+    """
+    account = get_ads_account(
+        account_id, access_token, request_timeout, app_api_version
+    )
+
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def campaigns(
+        fields: Sequence[str] = DEFAULT_CAMPAIGN_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_campaigns, fields, states, chunk_size)
+
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ads(
+        fields: Sequence[str] = DEFAULT_AD_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ads, fields, states, chunk_size)
+
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ad_sets(
+        fields: Sequence[str] = DEFAULT_ADSET_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ad_sets, fields, states, chunk_size)
+
+    @dlt.transformer(primary_key="id", write_disposition="replace", selected=True)
+    def leads(
+        items: TDataItems,
+        fields: Sequence[str] = DEFAULT_LEAD_FIELDS,
+        states: Sequence[str] = None,
+    ) -> Iterator[TDataItems]:
+        for item in items:
+            ad = Ad(item["id"])
+            yield get_data_chunked(ad.get_leads, fields, states, chunk_size)
+
+    @dlt.resource(primary_key="id", write_disposition="replace")
+    def ad_creatives(
+        fields: Sequence[str] = DEFAULT_ADCREATIVE_FIELDS, states: Sequence[str] = None
+    ) -> Iterator[TDataItems]:
+        yield get_data_chunked(account.get_ad_creatives, fields, states, chunk_size)
+
+    return campaigns, ads, ad_sets, ad_creatives, ads | leads
+
+
+@dlt.source(name="facebook_ads", max_table_nesting=0)
+def facebook_insights_source(
+    account_id: str = dlt.config.value,
+    access_token: str = dlt.secrets.value,
+    initial_load_past_days: int = 1,
+    fields: Sequence[str] = DEFAULT_INSIGHT_FIELDS,
+    attribution_window_days_lag: int = 7,
+    time_increment_days: int = 1,
+    breakdowns: TInsightsBreakdownOptions = "ads_insights",
+    action_breakdowns: Sequence[str] = ALL_ACTION_BREAKDOWNS,
+    level: TInsightsLevels = "ad",
+    action_attribution_windows: Sequence[str] = ALL_ACTION_ATTRIBUTION_WINDOWS,
+    batch_size: int = 50,
+    request_timeout: int = 300,
+    app_api_version: str = None,
+) -> DltResource:
+    """Incrementally loads insight reports with defined granularity level, fields, breakdowns etc.
+
+    By default, the reports are generated one by one for each day, starting with today - attribution_window_days_lag. On subsequent runs, only the reports
+    from the last report date until today are loaded (incremental load). The reports from the last 7 days (`attribution_window_days_lag`) are refreshed on each load to
+    account for changes during the attribution window.
+
+    Mind that each report is a job and takes some time to execute.
+
+    Args:
+        account_id (str, optional): Account id associated with the ad manager. See README.md
+        access_token (str, optional): Access token associated with the Business Facebook App. See README.md
+        initial_load_past_days (int, optional): How many past days (starting from today) to initially load. Defaults to 1.
+        fields (Sequence[str], optional): A list of fields to include in each report. Note that the `breakdowns` option adds fields automatically. Defaults to DEFAULT_INSIGHT_FIELDS.
+        attribution_window_days_lag (int, optional): Attribution window in days. The reports in the attribution window are refreshed on each run. Defaults to 7.
+        time_increment_days (int, optional): The report aggregation window in days. Use 7 for weekly aggregation. Defaults to 1.
+        breakdowns (TInsightsBreakdownOptions, optional): A preset with common aggregations. See settings.py for details. Defaults to "ads_insights".
+        action_breakdowns (Sequence[str], optional): Action aggregation types. See settings.py for details. Defaults to ALL_ACTION_BREAKDOWNS.
+        level (TInsightsLevels, optional): The granularity level. Defaults to "ad".
+        action_attribution_windows (Sequence[str], optional): Attribution windows for actions. Defaults to ALL_ACTION_ATTRIBUTION_WINDOWS.
+        batch_size (int, optional): Page size when reading data from a particular report. Defaults to 50.
+        request_timeout (int, optional): Connection timeout. Defaults to 300.
+        app_api_version (str, optional): The version of the Facebook API required by the app for which the access tokens were issued, e.g. 'v17.0'. Defaults to the facebook_business library default version.
+
+    Returns:
+        DltResource: facebook_insights
+
+    """
+    account = get_ads_account(
+        account_id, access_token, request_timeout, app_api_version
+    )
+
+    # we load with a defined lag
+    initial_load_start_date = pendulum.today().subtract(days=initial_load_past_days)
+    initial_load_start_date_str = initial_load_start_date.isoformat()
+
+    @dlt.resource(
+        primary_key=INSIGHTS_PRIMARY_KEY,
+        write_disposition="merge",
+        columns=INSIGHT_FIELDS_TYPES,
+    )
+    def facebook_insights(
+        date_start: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "date_start", initial_value=initial_load_start_date_str
+        ),
+    ) -> Iterator[TDataItems]:
+        start_date = get_start_date(date_start, attribution_window_days_lag)
+        end_date = pendulum.now()
+
+        # fetch insights in incremental day steps
+        while start_date <= end_date:
+            query = {
+                "level": level,
+                "action_breakdowns": list(action_breakdowns),
+                "breakdowns": list(
+                    INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["breakdowns"]
+                ),
+                "limit": batch_size,
+                "fields": list(
+                    set(fields)
+                    .union(INSIGHTS_BREAKDOWNS_OPTIONS[breakdowns]["fields"])
+                    .difference(INVALID_INSIGHTS_FIELDS)
+                ),
+                "time_increment": time_increment_days,
+                "action_attribution_windows": list(action_attribution_windows),
+                "time_ranges": [
+                    {
+                        "since": start_date.to_date_string(),
+                        "until": start_date.add(
+                            days=time_increment_days - 1
+                        ).to_date_string(),
+                    }
+                ],
+            }
+            job = execute_job(account.get_insights(params=query, is_async=True))
+            yield list(map(process_report_item, job.get_result()))
+            start_date = start_date.add(days=time_increment_days)
+
+    return facebook_insights
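For orientation, a sketch of how the two sources above might be driven directly through dlt (`with_resources` is the standard dlt API for selecting resources from a source; the account id and token are placeholders):

import dlt

from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source

pipeline = dlt.pipeline(
    pipeline_name="facebook_demo", destination="duckdb", dataset_name="facebook_ads"
)

# entity tables: pick a subset of campaigns/ads/ad_sets/ad_creatives/leads
fb_ads = facebook_ads_source(account_id="123456789", access_token="EAAB...")
pipeline.run(fb_ads.with_resources("campaigns", "ads"))

# insights load incrementally; reruns refresh only the attribution window
pipeline.run(facebook_insights_source(account_id="123456789", access_token="EAAB..."))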
ingestr/src/facebook_ads/helpers.py
ADDED
@@ -0,0 +1,255 @@
+"""Facebook ads source helpers"""
+
+import functools
+import itertools
+import time
+from typing import Any, Iterator, Sequence
+
+import dlt
+import humanize
+import pendulum
+from dlt.common import logger
+from dlt.common.configuration.inject import with_config
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import DictStrAny, TDataItem, TDataItems
+from dlt.sources.helpers import requests
+from dlt.sources.helpers.requests import Client
+from facebook_business import FacebookAdsApi
+from facebook_business.adobjects.abstractcrudobject import AbstractCrudObject
+from facebook_business.adobjects.abstractobject import AbstractObject
+from facebook_business.adobjects.adaccount import AdAccount
+from facebook_business.adobjects.user import User
+from facebook_business.api import FacebookResponse
+
+from .exceptions import InsightsJobTimeout
+from .settings import (
+    FACEBOOK_INSIGHTS_RETENTION_PERIOD,
+    INSIGHTS_PRIMARY_KEY,
+    TFbMethod,
+)
+
+
+def get_start_date(
+    incremental_start_date: dlt.sources.incremental[str],
+    attribution_window_days_lag: int = 7,
+) -> pendulum.DateTime:
+    """
+    Get the start date for incremental loading of Facebook Insights data.
+    """
+    start_date: pendulum.DateTime = ensure_pendulum_datetime(
+        incremental_start_date.start_value
+    ).subtract(days=attribution_window_days_lag)
+
+    # facebook forgets insights so trim the lag and warn
+    min_start_date = pendulum.today().subtract(
+        months=FACEBOOK_INSIGHTS_RETENTION_PERIOD
+    )
+    if start_date < min_start_date:
+        logger.warning(
+            "%s: Start date is earlier than %s months ago, using %s instead. "
+            "For more information, see https://www.facebook.com/business/help/1695754927158071?id=354406972049255",
+            "facebook_insights",
+            FACEBOOK_INSIGHTS_RETENTION_PERIOD,
+            min_start_date,
+        )
+        start_date = min_start_date
+        incremental_start_date.start_value = min_start_date
+
+    # lag the incremental start date by attribution window lag
+    incremental_start_date.start_value = start_date.isoformat()
+    return start_date
+
+
+def process_report_item(item: AbstractObject) -> DictStrAny:
+    d: DictStrAny = item.export_all_data()
+    for pki in INSIGHTS_PRIMARY_KEY:
+        if pki not in d:
+            d[pki] = "no_" + pki
+
+    return d
+
+
+def get_data_chunked(
+    method: TFbMethod, fields: Sequence[str], states: Sequence[str], chunk_size: int
+) -> Iterator[TDataItems]:
+    # add pagination and chunk into lists
+    params: DictStrAny = {"limit": chunk_size}
+    if states:
+        params.update({"effective_status": states})
+    it: map[DictStrAny] = map(
+        lambda c: c.export_all_data(), method(fields=fields, params=params)
+    )
+    while True:
+        chunk = list(itertools.islice(it, chunk_size))
+        if not chunk:
+            break
+        yield chunk
+
+
+def enrich_ad_objects(fb_obj_type: AbstractObject, fields: Sequence[str]) -> Any:
+    """Returns a transformation that will enrich any of the resources returned by `facebook_ads_source` with additional fields
+
+    In the example below we add "thumbnail_url" to all objects loaded by the `ad_creatives` resource:
+    >>> fb_ads = facebook_ads_source()
+    >>> fb_ads.ad_creatives.add_step(enrich_ad_objects(AdCreative, ["thumbnail_url"]))
+
+    Internally, the method uses the batch API to get data efficiently. Refer to the demo script for full examples.
+
+    Args:
+        fb_obj_type (AbstractObject): A Facebook Business object type (Ad, Campaign, AdSet, AdCreative, Lead). Import those types from this module
+        fields (Sequence[str]): A list/tuple of fields to add to each object.
+
+    Returns:
+        ItemTransformFunctionWithMeta[TDataItems]: A transformation function to be added to a resource with the `add_step` method
+    """
+
+    def _wrap(items: TDataItems, meta: Any = None) -> TDataItems:
+        api_batch = FacebookAdsApi.get_default_api().new_batch()
+
+        def update_item(resp: FacebookResponse, item: TDataItem) -> None:
+            item.update(resp.json())
+
+        def fail(resp: FacebookResponse) -> None:
+            raise resp.error()
+
+        for item in items:
+            o: AbstractCrudObject = fb_obj_type(item["id"])
+            o.api_get(
+                fields=fields,
+                batch=api_batch,
+                success=functools.partial(update_item, item=item),
+                failure=fail,
+            )
+        api_batch.execute()
+        return items
+
+    return _wrap
+
+
+JOB_TIMEOUT_INFO = """This is an intermittent error and may resolve itself on subsequent queries to the Facebook API.
+You should remove the fields in the `fields` argument that are not necessary, as that may help improve the reliability of the Facebook API."""
+
+
+def execute_job(
+    job: AbstractCrudObject,
+    insights_max_wait_to_start_seconds: int = 5 * 60,
+    insights_max_wait_to_finish_seconds: int = 30 * 60,
+    insights_max_async_sleep_seconds: int = 5 * 60,
+) -> AbstractCrudObject:
+    status: str = None
+    time_start = time.time()
+    sleep_time = 10
+    while status != "Job Completed":
+        duration = time.time() - time_start
+        job = job.api_get()
+        status = job["async_status"]
+        percent_complete = job["async_percent_completion"]
+
+        job_id = job["id"]
+        logger.info("%s, %d%% done", status, percent_complete)
+
+        if status == "Job Completed":
+            return job
+
+        if duration > insights_max_wait_to_start_seconds and percent_complete == 0:
+            pretty_error_message = (
+                "Insights job {} did not start after {} seconds. " + JOB_TIMEOUT_INFO
+            )
+            raise InsightsJobTimeout(
+                "facebook_insights",
+                pretty_error_message.format(job_id, insights_max_wait_to_start_seconds),
+            )
+        elif (
+            duration > insights_max_wait_to_finish_seconds and status != "Job Completed"
+        ):
+            pretty_error_message = (
+                "Insights job {} did not complete after {} seconds. " + JOB_TIMEOUT_INFO
+            )
+            raise InsightsJobTimeout(
+                "facebook_insights",
+                pretty_error_message.format(
+                    job_id, insights_max_wait_to_finish_seconds // 60
+                ),
+            )
+
+        logger.info("sleeping for %d seconds until job is done", sleep_time)
+        time.sleep(sleep_time)
+        if sleep_time < insights_max_async_sleep_seconds:
+            sleep_time = 2 * sleep_time
+    return job
+
+
+def get_ads_account(
+    account_id: str, access_token: str, request_timeout: float, app_api_version: str
+) -> AdAccount:
+    notify_on_token_expiration()
+
+    def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
+        try:
+            error = response.json()["error"]
+            code = error["code"]
+            message = error["message"]
+            should_retry = code in (
+                1,
+                2,
+                4,
+                17,
+                341,
+                32,
+                613,
+                *range(80000, 80007),
+                800008,
+                800009,
+                80014,
+            )
+            if should_retry:
+                logger.warning(
+                    "facebook_ads source will retry due to %s with error code %i"
+                    % (message, code)
+                )
+            return should_retry
+        except Exception:
+            return False
+
+    retry_session = Client(
+        request_timeout=request_timeout,
+        raise_for_status=False,
+        retry_condition=retry_on_limit,
+        request_max_attempts=12,
+        request_backoff_factor=2,
+    ).session
+    retry_session.params.update({"access_token": access_token})  # type: ignore
+    # patch dlt requests session with retries
+    API = FacebookAdsApi.init(
+        account_id="act_" + account_id,
+        access_token=access_token,
+        api_version=app_api_version,
+    )
+    API._session.requests = retry_session
+    user = User(fbid="me")
+
+    accounts = user.get_ad_accounts()
+    account: AdAccount = None
+    for acc in accounts:
+        if acc["account_id"] == account_id:
+            account = acc
+
+    if not account:
+        raise ValueError("Couldn't find account with id {}".format(account_id))
+
+    return account
+
+
+@with_config(sections=("sources", "facebook_ads"))
+def notify_on_token_expiration(access_token_expires_at: int = None) -> None:
+    """Notifies (currently via logger) if the access token expires in less than 7 days. Needs `access_token_expires_at` to be configured."""
+    if not access_token_expires_at:
+        logger.warning(
+            "Token expiration time notification disabled. Configure the token expiration timestamp in the access_token_expires_at config value"
+        )
+    else:
+        expires_at = pendulum.from_timestamp(access_token_expires_at)
+        if expires_at < pendulum.now().add(days=7):
+            logger.error(
+                f"Access Token expires in {humanize.precisedelta(pendulum.now() - expires_at)}. Replace the token now!"
+            )