ingestr 0.7.7__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/main.py +10 -0
- ingestr/src/.gitignore +10 -0
- ingestr/src/adjust/__init__.py +31 -0
- ingestr/src/adjust/helpers.py +82 -0
- ingestr/src/appsflyer/__init__.py +24 -0
- ingestr/src/appsflyer/client.py +106 -0
- ingestr/src/facebook_ads/__init__.py +197 -0
- ingestr/src/facebook_ads/exceptions.py +5 -0
- ingestr/src/facebook_ads/helpers.py +255 -0
- ingestr/src/facebook_ads/settings.py +208 -0
- ingestr/src/factory.py +15 -0
- ingestr/src/kafka/__init__.py +103 -0
- ingestr/src/kafka/helpers.py +227 -0
- ingestr/src/klaviyo/__init__.py +173 -0
- ingestr/src/klaviyo/client.py +212 -0
- ingestr/src/klaviyo/helpers.py +19 -0
- ingestr/src/shopify/__init__.py +1752 -54
- ingestr/src/shopify/helpers.py +73 -32
- ingestr/src/sources.py +230 -7
- ingestr/src/version.py +1 -1
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/METADATA +22 -1
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/RECORD +25 -11
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/WHEEL +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/entry_points.txt +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.8.1.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/kafka/helpers.py (new file)
@@ -0,0 +1,227 @@
```python
from typing import Any, Dict, List, Optional

from confluent_kafka import Consumer, Message, TopicPartition  # type: ignore
from confluent_kafka.admin import TopicMetadata  # type: ignore
from dlt import config
from dlt.common import pendulum
from dlt.common.configuration import configspec
from dlt.common.configuration.specs import CredentialsConfiguration
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import DictStrAny, TSecretValue
from dlt.common.utils import digest128


def default_msg_processor(msg: Message) -> Dict[str, Any]:
    """Basic Kafka message processor.

    Returns the message value and metadata. Timestamp consists of two values:
    (type of the timestamp, timestamp). Type represents one of the Python
    Kafka constants:
        TIMESTAMP_NOT_AVAILABLE - Timestamps not supported by broker.
        TIMESTAMP_CREATE_TIME - Message creation time (or source / producer time).
        TIMESTAMP_LOG_APPEND_TIME - Broker receive time.

    Args:
        msg (confluent_kafka.Message): A single Kafka message.

    Returns:
        dict: Processed Kafka message.
    """
    ts = msg.timestamp()
    topic = msg.topic()
    partition = msg.partition()
    key = msg.key()
    if key is not None:
        key = key.decode("utf-8")

    return {
        "_kafka": {
            "partition": partition,
            "topic": topic,
            "key": key,
            "offset": msg.offset(),
            "ts": {
                "type": ts[0],
                "value": ensure_pendulum_datetime(ts[1] / 1e3),
            },
            "data": msg.value().decode("utf-8"),
        },
        "_kafka_msg_id": digest128(topic + str(partition) + str(key)),
    }


class OffsetTracker(dict):  # type: ignore
    """Object to control offsets of the given topics.

    Tracks all the partitions of the given topics with two params:
    current offset and maximum offset (partition length).

    Args:
        consumer (confluent_kafka.Consumer): Kafka consumer.
        topic_names (List): Names of topics to track.
        pl_state (DictStrAny): Pipeline current state.
        start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
            are read. Older messages are ignored.
    """

    def __init__(
        self,
        consumer: Consumer,
        topic_names: List[str],
        pl_state: DictStrAny,
        start_from: pendulum.DateTime = None,  # type: ignore
    ):
        super().__init__()

        self._consumer = consumer
        self._topics = self._read_topics(topic_names)

        # read/init current offsets
        self._cur_offsets = pl_state.setdefault(
            "offsets", {t_name: {} for t_name in topic_names}
        )

        self._init_partition_offsets(start_from)

    def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
        """Read the given topics metadata from Kafka.

        Reads all the topics at once, instead of requesting
        each in a separate call. Returns only those needed.

        Args:
            topic_names (list): Names of topics to be read.

        Returns:
            dict: Metadata of the given topics.
        """
        tracked_topics = {}
        topics = self._consumer.list_topics().topics

        for t_name in topic_names:
            tracked_topics[t_name] = topics[t_name]

        return tracked_topics

    def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
        """Designate current and maximum offsets for every partition.

        Current offsets are read from the state, if present. Set equal
        to the partition beginning otherwise.

        Args:
            start_from (pendulum.DateTime): A timestamp, at which to start
                reading. Older messages are ignored.
        """
        all_parts = []
        for t_name, topic in self._topics.items():
            self[t_name] = {}

            # init all the topic partitions from the partitions' metadata
            parts = [
                TopicPartition(
                    t_name,
                    part,
                    start_from.int_timestamp * 1000 if start_from is not None else 0,
                )
                for part in topic.partitions
            ]

            # get offsets for the timestamp, if given
            if start_from is not None:
                ts_offsets = self._consumer.offsets_for_times(parts)

            # designate current and maximum offsets for every partition
            for i, part in enumerate(parts):
                max_offset = self._consumer.get_watermark_offsets(part)[1]

                if start_from is not None:
                    if ts_offsets[i].offset != -1:
                        cur_offset = ts_offsets[i].offset
                    else:
                        cur_offset = max_offset - 1
                else:
                    cur_offset = (
                        self._cur_offsets[t_name].get(str(part.partition), -1) + 1
                    )

                self[t_name][str(part.partition)] = {
                    "cur": cur_offset,
                    "max": max_offset,
                }

                parts[i].offset = cur_offset

            all_parts += parts

        # assign the current offsets to the consumer
        self._consumer.assign(all_parts)

    @property
    def has_unread(self) -> bool:
        """Check if there are unread messages in the tracked topics.

        Returns:
            bool: True, if there are messages to read, False if all
                the current offsets are equal to their maximums.
        """
        for parts in self.values():
            for part in parts.values():
                if part["cur"] + 1 < part["max"]:
                    return True

        return False

    def renew(self, msg: Message) -> None:
        """Update partition offset from the given message.

        Args:
            msg (confluent_kafka.Message): A read Kafka message.
        """
        topic = msg.topic()
        partition = str(msg.partition())

        offset = self[topic][partition]
        offset["cur"] = msg.offset()

        self._cur_offsets[topic][partition] = msg.offset()


@configspec
class KafkaCredentials(CredentialsConfiguration):
    """Kafka source credentials.

    NOTE: original Kafka credentials are written with a period, e.g.
    bootstrap.servers. However, KafkaCredentials expect them to
    use underscore symbols instead, e.g. bootstrap_servers.
    """

    bootstrap_servers: str = config.value
    group_id: str = config.value
    security_protocol: Optional[str] = None
    sasl_mechanisms: Optional[str] = None
    sasl_username: Optional[str] = None
    sasl_password: Optional[TSecretValue] = None

    def init_consumer(self) -> Consumer:
        """Init a Kafka consumer from this credentials.

        Returns:
            confluent_kafka.Consumer: an initiated consumer.
        """
        config = {
            "bootstrap.servers": self.bootstrap_servers,
            "group.id": self.group_id,
            "auto.offset.reset": "earliest",
        }

        if self.security_protocol:
            config["security.protocol"] = self.security_protocol
        if self.sasl_mechanisms:
            config["sasl.mechanisms"] = self.sasl_mechanisms
        if self.sasl_username:
            config["sasl.username"] = self.sasl_username
        if self.sasl_password:
            config["sasl.password"] = self.sasl_password

        return Consumer(config)
```
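
The file above is consumed by the new `ingestr/src/kafka/__init__.py` (+103 lines, not reproduced here): the credentials object builds a `confluent_kafka.Consumer`, `OffsetTracker` pins every partition of the requested topics to a starting offset and records it in the pipeline state, and `default_msg_processor` turns each polled message into a flat dict keyed by `_kafka` / `_kafka_msg_id`. The snippet below is a minimal, hypothetical driver loop for these helpers only, assuming a reachable broker, an existing topic named `my_topic`, and a plain dict standing in for the pipeline state; it is not the resource implementation shipped in the wheel.

```python
# Hypothetical driver loop for the helpers above; the broker address, topic
# name, and the plain dict used as pipeline state are illustrative assumptions.
from ingestr.src.kafka.helpers import (
    KafkaCredentials,
    OffsetTracker,
    default_msg_processor,
)

credentials = KafkaCredentials(
    bootstrap_servers="localhost:9092",
    group_id="ingestr-demo",
)
consumer = credentials.init_consumer()

pipeline_state: dict = {}  # dlt would normally persist this between runs
tracker = OffsetTracker(consumer, ["my_topic"], pipeline_state)

while tracker.has_unread:
    msg = consumer.poll(timeout=1.0)
    if msg is None or msg.error():
        continue
    row = default_msg_processor(msg)  # {"_kafka": {...}, "_kafka_msg_id": ...}
    tracker.renew(msg)  # advance the tracked offset for this partition
```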
ingestr/src/klaviyo/__init__.py (new file)
@@ -0,0 +1,173 @@
```python
from typing import Iterable

import dlt
import pendulum
import requests
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TAnyDateTime, TDataItem
from dlt.sources import DltResource
from dlt.sources.helpers.requests import Client

from ingestr.src.klaviyo.client import KlaviyoClient
from ingestr.src.klaviyo.helpers import split_date_range


def retry_on_limit(response: requests.Response, exception: BaseException) -> bool:
    return response.status_code == 429


def create_client() -> requests.Session:
    return Client(
        request_timeout=10.0,
        raise_for_status=False,
        retry_condition=retry_on_limit,
        request_max_attempts=12,
        request_backoff_factor=2,
    ).session


@dlt.source(max_table_nesting=0)
def klaviyo_source(api_key: str, start_date: TAnyDateTime) -> Iterable[DltResource]:
    start_date_obj = ensure_pendulum_datetime(start_date)
    client = KlaviyoClient(api_key)

    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
    def events(
        datetime=dlt.sources.incremental("datetime", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(datetime.start_value), pendulum.now()
        )

        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_events(create_client(), s, e)

    @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
    def profiles(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated.start_value), pendulum.now()
        )

        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_profiles(create_client(), s, e)

    @dlt.resource(write_disposition="merge", primary_key="id", parallelized=True)
    def campaigns(
        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated_at.start_value), pendulum.now()
        )

        for campaign_type in ["email", "sms"]:
            for start, end in intervals:
                yield lambda s=start, e=end, ct=campaign_type: client.fetch_campaigns(
                    create_client(), s, e, ct
                )

    @dlt.resource(write_disposition="merge", primary_key="id")
    def metrics(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_metrics(create_client(), updated.start_value)

    @dlt.resource(write_disposition="replace", primary_key="id")
    def tags() -> Iterable[TAnyDateTime]:
        yield from client.fetch_tag(create_client())

    @dlt.resource(write_disposition="replace", primary_key="id")
    def coupons() -> Iterable[TAnyDateTime]:
        yield from client.fetch_coupons(create_client())

    @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-variants")
    def catalog_variants(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_catalog_variant(create_client(), updated.start_value)

    @dlt.resource(
        write_disposition="merge", primary_key="id", name="catalog-categories"
    )
    def catalog_categories(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_catalog_categories(create_client(), updated.start_value)

    @dlt.resource(write_disposition="merge", primary_key="id", name="catalog-items")
    def catalog_items(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_catalog_item(create_client(), updated.start_value)

    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
    def forms(
        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated_at.start_value), pendulum.now()
        )

        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_forms(create_client(), s, e)

    @dlt.resource(write_disposition="merge", primary_key="id")
    def lists(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_lists(create_client(), updated.start_value)

    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
    def images(
        updated_at=dlt.sources.incremental("updated_at", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated_at.start_value), pendulum.now()
        )
        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_images(create_client(), s, e)

    @dlt.resource(write_disposition="merge", primary_key="id")
    def segments(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        yield from client.fetch_segments(create_client(), updated.start_value)

    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
    def flows(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated.start_value), pendulum.now()
        )
        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_flows(create_client(), s, e)

    @dlt.resource(write_disposition="append", primary_key="id", parallelized=True)
    def templates(
        updated=dlt.sources.incremental("updated", start_date_obj.isoformat()),
    ) -> Iterable[TDataItem]:
        intervals = split_date_range(
            pendulum.parse(updated.start_value), pendulum.now()
        )
        for start, end in intervals:
            yield lambda s=start, e=end: client.fetch_templates(create_client(), s, e)

    return (
        events,
        profiles,
        campaigns,
        metrics,
        tags,
        coupons,
        catalog_variants,
        catalog_categories,
        catalog_items,
        forms,
        lists,
        images,
        segments,
        flows,
        templates,
    )
```
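
`klaviyo_source` is a regular `dlt.source`, so besides the ingestr CLI it can in principle be run with a plain dlt pipeline. A minimal sketch, assuming a valid Klaviyo private API key and a local DuckDB destination (both placeholders, not taken from this diff):

```python
# Minimal sketch: pipeline name, destination and API key are placeholders.
import dlt

from ingestr.src.klaviyo import klaviyo_source

pipeline = dlt.pipeline(
    pipeline_name="klaviyo_demo",
    destination="duckdb",
    dataset_name="klaviyo_raw",
)

source = klaviyo_source(api_key="pk_...", start_date="2024-01-01")
# Load just two of the fifteen resources the source returns.
load_info = pipeline.run(source.with_resources("events", "profiles"))
print(load_info)
```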
ingestr/src/klaviyo/client.py (new file)
@@ -0,0 +1,212 @@
```python
from urllib.parse import urlencode

import pendulum
import requests

BASE_URL = "https://a.klaviyo.com/api"


class KlaviyoClient:
    def __init__(self, api_key: str):
        self.api_key = api_key

    def __get_headers(self):
        return {
            "Authorization": f"Klaviyo-API-Key {self.api_key}",
            "accept": "application/json",
            "revision": "2024-07-15",
        }

    def _flatten_attributes(self, items: list):
        for event in items:
            if "attributes" not in event:
                continue

            for attribute_key in event["attributes"]:
                event[attribute_key] = event["attributes"][attribute_key]

            del event["attributes"]
        return items

    def _fetch_pages(
        self, session: requests.Session, url: str, flat: bool = True
    ) -> list:
        all_items = []
        while True:
            response = session.get(url=url, headers=self.__get_headers())
            result = response.json()
            items = result.get("data", [])

            if flat:
                items = self._flatten_attributes(items)

            all_items.extend(items)
            nextURL = result.get("links", {}).get("next")
            if nextURL is None:
                break

            url = nextURL

        return all_items

    def fetch_events(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
    ):
        print(f"Fetching events for {start_date} to {end_date}")
        url = f"{BASE_URL}/events/?sort=-datetime&filter=and(greater-or-equal(datetime,{start_date}),less-than(datetime,{end_date}))"
        return self._fetch_pages(session, url)

    def fetch_metrics(
        self,
        session: requests.Session,
        last_updated: str,
    ):
        print(f"Fetching metrics since {last_updated}")
        url = f"{BASE_URL}/metrics"
        items = self._fetch_pages(session, url)

        last_updated_obj = pendulum.parse(last_updated)
        for item in items:
            updated_at = pendulum.parse(item["updated"])
            if updated_at > last_updated_obj:
                yield item

    def fetch_profiles(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
    ):
        pendulum_start_date = pendulum.parse(start_date)
        pendulum_start_date = pendulum_start_date.subtract(seconds=1)
        url = f"{BASE_URL}/profiles/?sort=updated&filter=and(greater-than(updated,{pendulum_start_date.isoformat()}),less-than(updated,{end_date}))"
        return self._fetch_pages(session, url)

    def fetch_campaigns(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
        campaign_type: str,
    ):
        print(f"Fetching {campaign_type} campaigns for {start_date} to {end_date}")

        base_url = f"{BASE_URL}/campaigns/"
        params = {
            "sort": "updated_at",
            "filter": f"and(equals(messages.channel,'{campaign_type}'),greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))",
        }
        url = f"{base_url}?{urlencode(params)}"
        pages = self._fetch_pages(session, url)
        for page in pages:
            page["campaign_type"] = campaign_type

        return pages

    def fetch_tag(self, session: requests.Session):
        url = f"{BASE_URL}/tags"
        return self._fetch_pages(session, url, False)

    def fetch_catalog_variant(
        self,
        session: requests.Session,
        last_updated: str,
    ):
        url = f"{BASE_URL}/catalog-variants"
        items = self._fetch_pages(session, url)
        last_updated_obj = pendulum.parse(last_updated)

        for item in items:
            updated_at = pendulum.parse(item["updated"])
            if updated_at > last_updated_obj:
                yield item

    def fetch_coupons(self, session: requests.Session):
        url = f"{BASE_URL}/coupons"
        return self._fetch_pages(session, url, False)

    def fetch_catalog_categories(
        self,
        session: requests.Session,
        last_updated: str,
    ):
        url = f"{BASE_URL}/catalog-categories"
        items = self._fetch_pages(session, url)
        last_updated_obj = pendulum.parse(last_updated)

        for item in items:
            updated_at = pendulum.parse(item["updated"])
            if updated_at > last_updated_obj:
                yield item

    def fetch_catalog_item(
        self,
        session: requests.Session,
        last_updated: str,
    ):
        url = f"{BASE_URL}/catalog-items"
        items = self._fetch_pages(session, url)
        last_updated_obj = pendulum.parse(last_updated)

        for item in items:
            updated_at = pendulum.parse(item["updated"])
            if updated_at > last_updated_obj:
                yield item

    def fetch_forms(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
    ):
        print(f"Fetching forms for {start_date} to {end_date}")
        url = f"{BASE_URL}/forms/?sort=-updated_at&filter=and(greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))"
        return self._fetch_pages(session, url)

    def fetch_lists(
        self,
        session: requests.Session,
        updated_date: str,
    ):
        # https://a.klaviyo.com/api/lists/?sort=-updated&filter=greater-than(updated,2024-02-01 00:00:00+00:00)
        url = f"{BASE_URL}/lists/?sort=-updated&filter=greater-than(updated,{updated_date})"
        return self._fetch_pages(session, url)

    def fetch_images(self, session: requests.Session, start_date: str, end_date: str):
        # https://a.klaviyo.com/api/images/?sort=-updated_at&filter=greater-or-equal(updated_at,2024-06-01 00:00:00+00:00),less-than(updated_at,2024-09-01 00:00:00+00:00)
        url = f"{BASE_URL}/images/?sort=-updated_at&filter=and(greater-or-equal(updated_at,{start_date}),less-than(updated_at,{end_date}))"
        return self._fetch_pages(session, url)

    def fetch_segments(
        self,
        session: requests.Session,
        updated_date: str,
    ):
        # https://a.klaviyo.com/api/segments/?sort=-updated&filter=greater-than(updated,2024-04-01 00:00:00+00:00)
        url = f"{BASE_URL}/segments/?sort=-updated&filter=greater-than(updated,{updated_date})"
        print("url", url)
        return self._fetch_pages(session, url)

    def fetch_flows(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
    ):
        print(f"Fetching events for {start_date} to {end_date}")
        # https://a.klaviyo.com/api/flows/?sort=-updated&filter=and(greater-or-equal(updated,2024-06-01 00:00:00+00:00),less-than(updated,2024-09-01 00:00:00+00:00))
        url = f"{BASE_URL}/flows/?sort=-updated&filter=and(greater-or-equal(updated,{start_date}),less-than(updated,{end_date}))"
        return self._fetch_pages(session, url)

    def fetch_templates(
        self,
        session: requests.Session,
        start_date: str,
        end_date: str,
    ):
        # https://a.klaviyo.com/api/templates/?sort=-updated&filter=and(greater-or-equal(updated,2024-06-01 00:00:00+00:00),less-than(updated,2024-09-01 00:00:00+00:00))
        url = f"{BASE_URL}/templates/?sort=-updated&filter=and(greater-or-equal(updated,{start_date}),less-than(updated,{end_date}))"
        return self._fetch_pages(session, url)
```
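
The client is normally driven by the resources in `ingestr/src/klaviyo/__init__.py`, but calling it directly makes the pagination behaviour easy to see: `_fetch_pages` follows the `links.next` cursor until it is exhausted and, unless `flat=False`, copies each record's `attributes` object up to the top level. A small sketch, assuming a valid API key and reusing `create_client()` from the source module for the retrying session:

```python
# Direct use of KlaviyoClient for a one-day window; the API key and the
# date range are placeholders for illustration.
import pendulum

from ingestr.src.klaviyo import create_client
from ingestr.src.klaviyo.client import KlaviyoClient

client = KlaviyoClient(api_key="pk_...")
session = create_client()  # requests.Session that retries on HTTP 429

start = pendulum.datetime(2024, 7, 1).isoformat()
end = pendulum.datetime(2024, 7, 2).isoformat()

events = client.fetch_events(session, start, end)  # list of flattened event dicts
metrics = list(client.fetch_metrics(session, start))  # generator filtered on "updated"
```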
ingestr/src/klaviyo/helpers.py (new file)
@@ -0,0 +1,19 @@
```python
from typing import List

import pendulum


def split_date_range(
    start_date: pendulum.DateTime, end_date: pendulum.DateTime
) -> List[tuple]:
    interval = "days"
    if (end_date - start_date).days <= 1:
        interval = "hours"

    intervals = []
    current = start_date
    while current < end_date:
        next_date = min(current.add(**{interval: 1}), end_date)
        intervals.append((current.isoformat(), next_date.isoformat()))
        current = next_date
    return intervals
```
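
`split_date_range` is what lets the parallelized Klaviyo resources fan out: it slices the window between the incremental cursor and `pendulum.now()` into day-sized chunks, falling back to hour-sized chunks when the whole span is one day or less, and each chunk becomes one lambda for dlt to evaluate. A short worked example of the helper on its own:

```python
# Worked example of split_date_range: a three-day span yields day-sized
# chunks, a six-hour span yields hour-sized chunks.
import pendulum

from ingestr.src.klaviyo.helpers import split_date_range

days = split_date_range(pendulum.datetime(2024, 1, 1), pendulum.datetime(2024, 1, 4))
# [('2024-01-01T00:00:00+00:00', '2024-01-02T00:00:00+00:00'),
#  ('2024-01-02T00:00:00+00:00', '2024-01-03T00:00:00+00:00'),
#  ('2024-01-03T00:00:00+00:00', '2024-01-04T00:00:00+00:00')]

hours = split_date_range(pendulum.datetime(2024, 1, 1, 0), pendulum.datetime(2024, 1, 1, 6))
assert len(hours) == 6
assert hours[0] == ("2024-01-01T00:00:00+00:00", "2024-01-01T01:00:00+00:00")
```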