ingestr 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.

@@ -0,0 +1,208 @@
+ """Facebook ads source settings and constants"""
+
+ from typing import Any, Callable, Dict, Iterator, Literal
+
+ from dlt.common.schema.typing import TTableSchemaColumns
+ from facebook_business.adobjects.abstractobject import AbstractObject
+
+ TFbMethod = Callable[..., Iterator[AbstractObject]]
+
+
+ DEFAULT_FIELDS = (
+     "id",
+     "updated_time",
+     "created_time",
+     "name",
+     "status",
+     "effective_status",
+ )
+
+ DEFAULT_CAMPAIGN_FIELDS = DEFAULT_FIELDS + (
+     "objective",
+     "start_time",
+     "stop_time",
+     "daily_budget",
+     "lifetime_budget",
+ )
+
+ DEFAULT_AD_FIELDS = DEFAULT_FIELDS + (
+     "adset_id",
+     "campaign_id",
+     "creative",
+     "targeting",
+     "tracking_specs",
+     "conversion_specs",
+ )
+
+ DEFAULT_ADSET_FIELDS = DEFAULT_FIELDS + (
+     "campaign_id",
+     "start_time",
+     "end_time",
+     "daily_budget",
+     "lifetime_budget",
+     "optimization_goal",
+     "promoted_object",
+     "billing_event",
+     "bid_amount",
+     "bid_strategy",
+     "targeting",
+ )
+
+ DEFAULT_ADCREATIVE_FIELDS = (
+     "id",
+     "name",
+     "status",
+     "thumbnail_url",
+     "object_story_spec",
+     "effective_object_story_id",
+     "call_to_action_type",
+     "object_type",
+     "template_url",
+     "url_tags",
+     "instagram_actor_id",
+     "product_set_id",
+ )
+
+ DEFAULT_LEAD_FIELDS = (
+     "id",
+     "created_time",
+     "ad_id",
+     "ad_name",
+     "adset_id",
+     "adset_name",
+     "campaign_id",
+     "campaign_name",
+     "form_id",
+     "field_data",
+ )
+
+ DEFAULT_INSIGHT_FIELDS = (
+     "campaign_id",
+     "adset_id",
+     "ad_id",
+     "date_start",
+     "date_stop",
+     "reach",
+     "impressions",
+     "frequency",
+     "clicks",
+     "unique_clicks",
+     "ctr",
+     "unique_ctr",
+     "cpc",
+     "cpm",
+     "cpp",
+     "spend",
+     "actions",
+     "action_values",
+     "cost_per_action_type",
+     "website_ctr",
+     "account_currency",
+     "ad_click_actions",
+     "ad_name",
+     "adset_name",
+     "campaign_name",
+     "country",
+     "dma",
+     "full_view_impressions",
+     "full_view_reach",
+     "inline_link_click_ctr",
+     "outbound_clicks",
+     "reach",
+     "social_spend",
+     "spend",
+     "website_ctr",
+ )
+
+ TInsightsLevels = Literal["account", "campaign", "adset", "ad"]
+
+ INSIGHTS_PRIMARY_KEY = ("campaign_id", "adset_id", "ad_id", "date_start")
+
+ ALL_STATES = {
+     "effective_status": [
+         "ACTIVE",
+         "PAUSED",
+         "DELETED",
+         "PENDING_REVIEW",
+         "DISAPPROVED",
+         "PREAPPROVED",
+         "PENDING_BILLING_INFO",
+         "CAMPAIGN_PAUSED",
+         "ARCHIVED",
+         "ADSET_PAUSED",
+     ]
+ }
+
+ TInsightsBreakdownOptions = Literal[
+     "ads_insights",
+     "ads_insights_age_and_gender",
+     "ads_insights_country",
+     "ads_insights_platform_and_device",
+     "ads_insights_region",
+     "ads_insights_dma",
+     "ads_insights_hourly_advertiser",
+ ]
+
+ ALL_ACTION_ATTRIBUTION_WINDOWS = (
+     "1d_click",
+     "7d_click",
+     "28d_click",
+     "1d_view",
+     "7d_view",
+     "28d_view",
+ )
+
+ ALL_ACTION_BREAKDOWNS = ("action_type", "action_target_id", "action_destination")
+
+ INSIGHTS_BREAKDOWNS_OPTIONS: Dict[TInsightsBreakdownOptions, Any] = {
+     "ads_insights": {"breakdowns": (), "fields": ()},
+     "ads_insights_age_and_gender": {
+         "breakdowns": ("age", "gender"),
+         "fields": ("age", "gender"),
+     },
+     "ads_insights_country": {"breakdowns": ("country",), "fields": ("country",)},
+     "ads_insights_platform_and_device": {
+         "breakdowns": ("publisher_platform", "platform_position", "impression_device"),
+         "fields": ("publisher_platform", "platform_position", "impression_device"),
+     },
+     "ads_insights_region": {"breakdowns": ("region",), "fields": ("region",)},
+     "ads_insights_dma": {"breakdowns": ("dma",), "fields": ("dma",)},
+     "ads_insights_hourly_advertiser": {
+         "breakdowns": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+         "fields": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+     },
+ }
+
+ INSIGHT_FIELDS_TYPES: TTableSchemaColumns = {
+     "campaign_id": {"data_type": "bigint"},
+     "adset_id": {"data_type": "bigint"},
+     "ad_id": {"data_type": "bigint"},
+     "date_start": {"data_type": "timestamp"},
+     "date_stop": {"data_type": "timestamp"},
+     "reach": {"data_type": "bigint"},
+     "impressions": {"data_type": "bigint"},
+     "frequency": {"data_type": "decimal"},
+     "clicks": {"data_type": "bigint"},
+     "unique_clicks": {"data_type": "bigint"},
+     "ctr": {"data_type": "decimal"},
+     "unique_ctr": {"data_type": "decimal"},
+     "cpc": {"data_type": "decimal"},
+     "cpm": {"data_type": "decimal"},
+     "cpp": {"data_type": "decimal"},
+     "spend": {"data_type": "decimal"},
+ }
+
+ INVALID_INSIGHTS_FIELDS = [
+     "impression_device",
+     "publisher_platform",
+     "platform_position",
+     "age",
+     "gender",
+     "country",
+     "placement",
+     "region",
+     "dma",
+     "hourly_stats_aggregated_by_advertiser_time_zone",
+ ]
+
+ FACEBOOK_INSIGHTS_RETENTION_PERIOD = 37  # months
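These constants are presumably consumed by the Facebook ads resources when they build insights requests: a breakdown preset contributes both extra fields and breakdowns, while INVALID_INSIGHTS_FIELDS appears to list columns that cannot be requested as plain fields. A minimal sketch of how the constants might be combined; the build_insights_request helper is hypothetical and not part of the package:

from typing import Tuple

def build_insights_request(option: TInsightsBreakdownOptions) -> Tuple[Tuple[str, ...], Tuple[str, ...]]:
    # Hypothetical helper: start from the default insight fields, drop the
    # ones listed as invalid, then add the preset's extra fields/breakdowns.
    preset = INSIGHTS_BREAKDOWNS_OPTIONS[option]
    fields = tuple(f for f in DEFAULT_INSIGHT_FIELDS if f not in INVALID_INSIGHTS_FIELDS)
    return fields + tuple(preset["fields"]), tuple(preset["breakdowns"])

fields, breakdowns = build_insights_request("ads_insights_country")
# breakdowns == ("country",); fields ends with the preset's "country" column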
ingestr/src/factory.py CHANGED
@@ -15,14 +15,19 @@ from ingestr.src.destinations import (
      SynapseDestination,
  )
  from ingestr.src.sources import (
+     AirtableSource,
      ChessSource,
+     FacebookAdsSource,
      GoogleSheetsSource,
      GorgiasSource,
      HubspotSource,
+     KafkaSource,
+     KlaviyoSource,
      LocalCsvSource,
      MongoDbSource,
      NotionSource,
      ShopifySource,
+     SlackSource,
      SqlSource,
      StripeAnalyticsSource,
  )
@@ -109,8 +114,18 @@ class SourceDestinationFactory:
              return ChessSource()
          elif self.source_scheme == "stripe":
              return StripeAnalyticsSource()
+         elif self.source_scheme == "facebookads":
+             return FacebookAdsSource()
+         elif self.source_scheme == "slack":
+             return SlackSource()
          elif self.source_scheme == "hubspot":
              return HubspotSource()
+         elif self.source_scheme == "airtable":
+             return AirtableSource()
+         elif self.source_scheme == "klaviyo":
+             return KlaviyoSource()
+         elif self.source_scheme == "kafka":
+             return KafkaSource()
          else:
              raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
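The factory resolves each URI scheme to a source class through the if/elif chain above. As an illustration only, the same dispatch can be expressed as a lookup table; this is a sketch, not the package's implementation:

from typing import Callable, Dict

# Illustrative registry built from the schemes handled in the diff above.
SOURCE_REGISTRY: Dict[str, Callable[[], object]] = {
    "chess": ChessSource,
    "stripe": StripeAnalyticsSource,
    "facebookads": FacebookAdsSource,
    "slack": SlackSource,
    "hubspot": HubspotSource,
    "airtable": AirtableSource,
    "klaviyo": KlaviyoSource,
    "kafka": KafkaSource,
}

def resolve_source(scheme: str) -> object:
    try:
        return SOURCE_REGISTRY[scheme]()
    except KeyError:
        raise ValueError(f"Unsupported source scheme: {scheme}") from None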
@@ -0,0 +1,103 @@
+ """A source to extract Kafka messages.
+
+ When extraction starts, the partition lengths are checked and
+ data is read only up to them, overriding Kafka's default
+ behavior of waiting for new messages in an endless loop.
+ """
+
+ from contextlib import closing
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+
+ import dlt
+ from confluent_kafka import Consumer, Message  # type: ignore
+ from dlt.common import logger
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import TAnyDateTime, TDataItem
+
+ from .helpers import (
+     KafkaCredentials,
+     OffsetTracker,
+     default_msg_processor,
+ )
+
+
+ @dlt.resource(
+     name="kafka_messages",
+     table_name=lambda msg: msg["_kafka"]["topic"],
+     standalone=True,
+ )
+ def kafka_consumer(
+     topics: Union[str, List[str]],
+     credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
+     msg_processor: Optional[
+         Callable[[Message], Dict[str, Any]]
+     ] = default_msg_processor,
+     batch_size: Optional[int] = 3000,
+     batch_timeout: Optional[int] = 3,
+     start_from: Optional[TAnyDateTime] = None,
+ ) -> Iterable[TDataItem]:
+     """Extract recent messages from the given Kafka topics.
+
+     The resource tracks offsets for all the topics and partitions,
+     and so reads data incrementally.
+
+     Messages from different topics are saved in different tables.
+
+     Args:
+         topics (Union[str, List[str]]): Names of topics to extract.
+         credentials (Optional[Union[KafkaCredentials, Consumer]]):
+             Auth credentials or an initiated Kafka consumer. By default,
+             it is taken from secrets.
+         msg_processor (Optional[Callable]): A converter function, which will
+             process every Kafka message after it is read and before it is
+             transferred to the destination.
+         batch_size (Optional[int]): Messages batch size to read at once.
+         batch_timeout (Optional[int]): Maximum time to wait for a batch to be
+             consumed, in seconds.
+         start_from (Optional[TAnyDateTime]): A timestamp, at which to start
+             reading. Older messages are ignored.
+
+     Yields:
+         Iterable[TDataItem]: Kafka messages.
+     """
+     if not isinstance(topics, list):
+         topics = [topics]
+
+     if isinstance(credentials, Consumer):
+         consumer = credentials
+     elif isinstance(credentials, KafkaCredentials):
+         consumer = credentials.init_consumer()
+     else:
+         raise TypeError(
+             (
+                 "Wrong credentials type provided. Need to be of type: "
+                 "KafkaCredentials or confluent_kafka.Consumer"
+             )
+         )
+
+     if start_from is not None:
+         start_from = ensure_pendulum_datetime(start_from)
+
+     tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)  # type: ignore
+
+     # read messages up to the maximum offsets,
+     # not waiting for new messages
+     with closing(consumer):
+         while tracker.has_unread:
+             messages = consumer.consume(batch_size, timeout=batch_timeout)
+             if not messages:
+                 break
+
+             batch = []
+             for msg in messages:
+                 if msg.error():
+                     err = msg.error()
+                     if err.retriable() or not err.fatal():
+                         logger.warning(f"ERROR: {err} - RETRYING")
+                     else:
+                         raise err
+                 else:
+                     batch.append(msg_processor(msg))  # type: ignore
+                     tracker.renew(msg)
+
+             yield batch
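A minimal sketch of running this resource in a dlt pipeline. The broker address, group id, pipeline name, dataset name, and the duckdb destination are illustrative assumptions; kafka_consumer and KafkaCredentials are the objects defined in this diff (their module paths are not shown here), and in practice credentials usually come from dlt secrets via the dlt.secrets.value default:

import dlt

# Assumed connection values; KafkaCredentials is the configspec defined in the
# helpers file further down in this diff.
credentials = KafkaCredentials(
    bootstrap_servers="localhost:9092",
    group_id="ingestr-demo",
)

pipeline = dlt.pipeline(
    pipeline_name="kafka_demo",
    destination="duckdb",
    dataset_name="kafka_data",
)

# Reads the "events" topic up to the current partition ends and loads it;
# each topic lands in a table named after the topic.
load_info = pipeline.run(kafka_consumer("events", credentials=credentials))
print(load_info)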
@@ -0,0 +1,227 @@
+ from typing import Any, Dict, List, Optional
+
+ from confluent_kafka import Consumer, Message, TopicPartition  # type: ignore
+ from confluent_kafka.admin import TopicMetadata  # type: ignore
+ from dlt import config
+ from dlt.common import pendulum
+ from dlt.common.configuration import configspec
+ from dlt.common.configuration.specs import CredentialsConfiguration
+ from dlt.common.time import ensure_pendulum_datetime
+ from dlt.common.typing import DictStrAny, TSecretValue
+ from dlt.common.utils import digest128
+
+
+ def default_msg_processor(msg: Message) -> Dict[str, Any]:
+     """Basic Kafka message processor.
+
+     Returns the message value and metadata. Timestamp consists of two values:
+     (type of the timestamp, timestamp). Type represents one of the Python
+     Kafka constants:
+         TIMESTAMP_NOT_AVAILABLE - Timestamps not supported by broker.
+         TIMESTAMP_CREATE_TIME - Message creation time (or source / producer time).
+         TIMESTAMP_LOG_APPEND_TIME - Broker receive time.
+
+     Args:
+         msg (confluent_kafka.Message): A single Kafka message.
+
+     Returns:
+         dict: Processed Kafka message.
+     """
+     ts = msg.timestamp()
+     topic = msg.topic()
+     partition = msg.partition()
+     key = msg.key()
+     if key is not None:
+         key = key.decode("utf-8")
+
+     return {
+         "_kafka": {
+             "partition": partition,
+             "topic": topic,
+             "key": key,
+             "offset": msg.offset(),
+             "ts": {
+                 "type": ts[0],
+                 "value": ensure_pendulum_datetime(ts[1] / 1e3),
+             },
+             "data": msg.value().decode("utf-8"),
+         },
+         "_kafka_msg_id": digest128(topic + str(partition) + str(key)),
+     }
+
+
+ class OffsetTracker(dict):  # type: ignore
+     """Object to control offsets of the given topics.
+
+     Tracks all the partitions of the given topics with two params:
+     current offset and maximum offset (partition length).
+
+     Args:
+         consumer (confluent_kafka.Consumer): Kafka consumer.
+         topic_names (List): Names of topics to track.
+         pl_state (DictStrAny): Pipeline current state.
+         start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
+             are read. Older messages are ignored.
+     """
+
+     def __init__(
+         self,
+         consumer: Consumer,
+         topic_names: List[str],
+         pl_state: DictStrAny,
+         start_from: pendulum.DateTime = None,  # type: ignore
+     ):
+         super().__init__()
+
+         self._consumer = consumer
+         self._topics = self._read_topics(topic_names)
+
+         # read/init current offsets
+         self._cur_offsets = pl_state.setdefault(
+             "offsets", {t_name: {} for t_name in topic_names}
+         )
+
+         self._init_partition_offsets(start_from)
+
+     def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
+         """Read the given topics' metadata from Kafka.
+
+         Reads all the topics at once, instead of requesting
+         each in a separate call. Returns only those needed.
+
+         Args:
+             topic_names (list): Names of topics to be read.
+
+         Returns:
+             dict: Metadata of the given topics.
+         """
+         tracked_topics = {}
+         topics = self._consumer.list_topics().topics
+
+         for t_name in topic_names:
+             tracked_topics[t_name] = topics[t_name]
+
+         return tracked_topics
+
+     def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
+         """Designate current and maximum offsets for every partition.
+
+         Current offsets are read from the state, if present. Set equal
+         to the partition beginning otherwise.
+
+         Args:
+             start_from (pendulum.DateTime): A timestamp, at which to start
+                 reading. Older messages are ignored.
+         """
+         all_parts = []
+         for t_name, topic in self._topics.items():
+             self[t_name] = {}
+
+             # init all the topic partitions from the partitions' metadata
+             parts = [
+                 TopicPartition(
+                     t_name,
+                     part,
+                     start_from.int_timestamp * 1000 if start_from is not None else 0,
+                 )
+                 for part in topic.partitions
+             ]
+
+             # get offsets for the timestamp, if given
+             if start_from is not None:
+                 ts_offsets = self._consumer.offsets_for_times(parts)
+
+             # designate current and maximum offsets for every partition
+             for i, part in enumerate(parts):
+                 max_offset = self._consumer.get_watermark_offsets(part)[1]
+
+                 if start_from is not None:
+                     if ts_offsets[i].offset != -1:
+                         cur_offset = ts_offsets[i].offset
+                     else:
+                         cur_offset = max_offset - 1
+                 else:
+                     cur_offset = (
+                         self._cur_offsets[t_name].get(str(part.partition), -1) + 1
+                     )
+
+                 self[t_name][str(part.partition)] = {
+                     "cur": cur_offset,
+                     "max": max_offset,
+                 }
+
+                 parts[i].offset = cur_offset
+
+             all_parts += parts
+
+         # assign the current offsets to the consumer
+         self._consumer.assign(all_parts)
+
+     @property
+     def has_unread(self) -> bool:
+         """Check if there are unread messages in the tracked topics.
+
+         Returns:
+             bool: True, if there are messages to read, False if all
+                 the current offsets are equal to their maximums.
+         """
+         for parts in self.values():
+             for part in parts.values():
+                 if part["cur"] + 1 < part["max"]:
+                     return True
+
+         return False
+
+     def renew(self, msg: Message) -> None:
+         """Update the partition offset from the given message.
+
+         Args:
+             msg (confluent_kafka.Message): A read Kafka message.
+         """
+         topic = msg.topic()
+         partition = str(msg.partition())
+
+         offset = self[topic][partition]
+         offset["cur"] = msg.offset()
+
+         self._cur_offsets[topic][partition] = msg.offset()
+
+
+ @configspec
+ class KafkaCredentials(CredentialsConfiguration):
+     """Kafka source credentials.
+
+     NOTE: the original Kafka settings are written with a period, e.g.
+     bootstrap.servers. However, KafkaCredentials expects them to
+     use underscores instead, e.g. bootstrap_servers.
+     """
+
+     bootstrap_servers: str = config.value
+     group_id: str = config.value
+     security_protocol: Optional[str] = None
+     sasl_mechanisms: Optional[str] = None
+     sasl_username: Optional[str] = None
+     sasl_password: Optional[TSecretValue] = None
+
+     def init_consumer(self) -> Consumer:
+         """Init a Kafka consumer from these credentials.
+
+         Returns:
+             confluent_kafka.Consumer: an initiated consumer.
+         """
+         config = {
+             "bootstrap.servers": self.bootstrap_servers,
+             "group.id": self.group_id,
+             "auto.offset.reset": "earliest",
+         }
+
+         if self.security_protocol:
+             config["security.protocol"] = self.security_protocol
+         if self.sasl_mechanisms:
+             config["sasl.mechanisms"] = self.sasl_mechanisms
+         if self.sasl_username:
+             config["sasl.username"] = self.sasl_username
+         if self.sasl_password:
+             config["sasl.password"] = self.sasl_password
+
+         return Consumer(config)
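default_msg_processor is only the default for the resource's msg_processor hook. A small sketch of a custom processor that reuses it and additionally parses the payload as JSON; it is illustrative, assumes the message values are JSON documents, and assumes default_msg_processor and kafka_consumer from this diff are importable:

import json
from typing import Any, Dict

from confluent_kafka import Message  # type: ignore


def json_msg_processor(msg: Message) -> Dict[str, Any]:
    # Reuse the default metadata extraction, then decode the payload,
    # which default_msg_processor leaves as a UTF-8 string under "data".
    row = default_msg_processor(msg)
    row["payload"] = json.loads(row["_kafka"]["data"])
    return row

# Passed to the resource in place of the default, e.g.:
# kafka_consumer("events", credentials=credentials, msg_processor=json_msg_processor)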