ingestr 0.7.7__py3-none-any.whl → 0.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- ingestr/main.py +9 -0
- ingestr/src/.gitignore +10 -0
- ingestr/src/facebook_ads/__init__.py +197 -0
- ingestr/src/facebook_ads/exceptions.py +5 -0
- ingestr/src/facebook_ads/helpers.py +255 -0
- ingestr/src/facebook_ads/settings.py +208 -0
- ingestr/src/factory.py +9 -0
- ingestr/src/kafka/__init__.py +103 -0
- ingestr/src/kafka/helpers.py +227 -0
- ingestr/src/klaviyo/_init_.py +173 -0
- ingestr/src/klaviyo/client.py +212 -0
- ingestr/src/klaviyo/helpers.py +19 -0
- ingestr/src/sources.py +141 -0
- ingestr/src/version.py +1 -1
- {ingestr-0.7.7.dist-info → ingestr-0.7.8.dist-info}/METADATA +13 -1
- {ingestr-0.7.7.dist-info → ingestr-0.7.8.dist-info}/RECORD +19 -9
- {ingestr-0.7.7.dist-info → ingestr-0.7.8.dist-info}/WHEEL +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.7.8.dist-info}/entry_points.txt +0 -0
- {ingestr-0.7.7.dist-info → ingestr-0.7.8.dist-info}/licenses/LICENSE.md +0 -0

ingestr/src/facebook_ads/settings.py ADDED

@@ -0,0 +1,208 @@
+"""Facebook ads source settings and constants"""
+
+from typing import Any, Callable, Dict, Iterator, Literal
+
+from dlt.common.schema.typing import TTableSchemaColumns
+from facebook_business.adobjects.abstractobject import AbstractObject
+
+TFbMethod = Callable[..., Iterator[AbstractObject]]
+
+
+DEFAULT_FIELDS = (
+    "id",
+    "updated_time",
+    "created_time",
+    "name",
+    "status",
+    "effective_status",
+)
+
+DEFAULT_CAMPAIGN_FIELDS = DEFAULT_FIELDS + (
+    "objective",
+    "start_time",
+    "stop_time",
+    "daily_budget",
+    "lifetime_budget",
+)
+
+DEFAULT_AD_FIELDS = DEFAULT_FIELDS + (
+    "adset_id",
+    "campaign_id",
+    "creative",
+    "targeting",
+    "tracking_specs",
+    "conversion_specs",
+)
+
+DEFAULT_ADSET_FIELDS = DEFAULT_FIELDS + (
+    "campaign_id",
+    "start_time",
+    "end_time",
+    "daily_budget",
+    "lifetime_budget",
+    "optimization_goal",
+    "promoted_object",
+    "billing_event",
+    "bid_amount",
+    "bid_strategy",
+    "targeting",
+)
+
+DEFAULT_ADCREATIVE_FIELDS = (
+    "id",
+    "name",
+    "status",
+    "thumbnail_url",
+    "object_story_spec",
+    "effective_object_story_id",
+    "call_to_action_type",
+    "object_type",
+    "template_url",
+    "url_tags",
+    "instagram_actor_id",
+    "product_set_id",
+)
+
+DEFAULT_LEAD_FIELDS = (
+    "id",
+    "created_time",
+    "ad_id",
+    "ad_name",
+    "adset_id",
+    "adset_name",
+    "campaign_id",
+    "campaign_name",
+    "form_id",
+    "field_data",
+)
+
+DEFAULT_INSIGHT_FIELDS = (
+    "campaign_id",
+    "adset_id",
+    "ad_id",
+    "date_start",
+    "date_stop",
+    "reach",
+    "impressions",
+    "frequency",
+    "clicks",
+    "unique_clicks",
+    "ctr",
+    "unique_ctr",
+    "cpc",
+    "cpm",
+    "cpp",
+    "spend",
+    "actions",
+    "action_values",
+    "cost_per_action_type",
+    "website_ctr",
+    "account_currency",
+    "ad_click_actions",
+    "ad_name",
+    "adset_name",
+    "campaign_name",
+    "country",
+    "dma",
+    "full_view_impressions",
+    "full_view_reach",
+    "inline_link_click_ctr",
+    "outbound_clicks",
+    "reach",
+    "social_spend",
+    "spend",
+    "website_ctr",
+)
+
+TInsightsLevels = Literal["account", "campaign", "adset", "ad"]
+
+INSIGHTS_PRIMARY_KEY = ("campaign_id", "adset_id", "ad_id", "date_start")
+
+ALL_STATES = {
+    "effective_status": [
+        "ACTIVE",
+        "PAUSED",
+        "DELETED",
+        "PENDING_REVIEW",
+        "DISAPPROVED",
+        "PREAPPROVED",
+        "PENDING_BILLING_INFO",
+        "CAMPAIGN_PAUSED",
+        "ARCHIVED",
+        "ADSET_PAUSED",
+    ]
+}
+
+TInsightsBreakdownOptions = Literal[
+    "ads_insights",
+    "ads_insights_age_and_gender",
+    "ads_insights_country",
+    "ads_insights_platform_and_device",
+    "ads_insights_region",
+    "ads_insights_dma",
+    "ads_insights_hourly_advertiser",
+]
+
+ALL_ACTION_ATTRIBUTION_WINDOWS = (
+    "1d_click",
+    "7d_click",
+    "28d_click",
+    "1d_view",
+    "7d_view",
+    "28d_view",
+)
+
+ALL_ACTION_BREAKDOWNS = ("action_type", "action_target_id", "action_destination")
+
+INSIGHTS_BREAKDOWNS_OPTIONS: Dict[TInsightsBreakdownOptions, Any] = {
+    "ads_insights": {"breakdowns": (), "fields": ()},
+    "ads_insights_age_and_gender": {
+        "breakdowns": ("age", "gender"),
+        "fields": ("age", "gender"),
+    },
+    "ads_insights_country": {"breakdowns": ("country",), "fields": ("country",)},
+    "ads_insights_platform_and_device": {
+        "breakdowns": ("publisher_platform", "platform_position", "impression_device"),
+        "fields": ("publisher_platform", "platform_position", "impression_device"),
+    },
+    "ads_insights_region": {"breakdowns": ("region",), "fields": ("region",)},
+    "ads_insights_dma": {"breakdowns": ("dma",), "fields": ("dma",)},
+    "ads_insights_hourly_advertiser": {
+        "breakdowns": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+        "fields": ("hourly_stats_aggregated_by_advertiser_time_zone",),
+    },
+}
+
+INSIGHT_FIELDS_TYPES: TTableSchemaColumns = {
+    "campaign_id": {"data_type": "bigint"},
+    "adset_id": {"data_type": "bigint"},
+    "ad_id": {"data_type": "bigint"},
+    "date_start": {"data_type": "timestamp"},
+    "date_stop": {"data_type": "timestamp"},
+    "reach": {"data_type": "bigint"},
+    "impressions": {"data_type": "bigint"},
+    "frequency": {"data_type": "decimal"},
+    "clicks": {"data_type": "bigint"},
+    "unique_clicks": {"data_type": "bigint"},
+    "ctr": {"data_type": "decimal"},
+    "unique_ctr": {"data_type": "decimal"},
+    "cpc": {"data_type": "decimal"},
+    "cpm": {"data_type": "decimal"},
+    "cpp": {"data_type": "decimal"},
+    "spend": {"data_type": "decimal"},
+}
+
+INVALID_INSIGHTS_FIELDS = [
+    "impression_device",
+    "publisher_platform",
+    "platform_position",
+    "age",
+    "gender",
+    "country",
+    "placement",
+    "region",
+    "dma",
+    "hourly_stats_aggregated_by_advertiser_time_zone",
+]
+
+FACEBOOK_INSIGHTS_RETENTION_PERIOD = 37  # months
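
settings.py only declares constants for the new Facebook Ads source. As an illustration, here is a minimal sketch, not taken from the package, of how a caller might assemble the field and breakdown lists for an insights request; only the constant names above are real, the combination logic is an assumption.

# Hypothetical sketch: only the constant names come from settings.py above;
# the combination logic is illustrative, not the package's actual helper code.
from ingestr.src.facebook_ads.settings import (
    DEFAULT_INSIGHT_FIELDS,
    INSIGHTS_BREAKDOWNS_OPTIONS,
    INVALID_INSIGHTS_FIELDS,
)

option = INSIGHTS_BREAKDOWNS_OPTIONS["ads_insights_age_and_gender"]

# request the default insight fields plus the breakdown-specific ones,
# dropping anything the Insights API only accepts as a breakdown
fields = [
    field
    for field in (*DEFAULT_INSIGHT_FIELDS, *option["fields"])
    if field not in INVALID_INSIGHTS_FIELDS
]
breakdowns = list(option["breakdowns"])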

ingestr/src/factory.py CHANGED

@@ -17,9 +17,12 @@ from ingestr.src.destinations import (
 from ingestr.src.sources import (
     AirtableSource,
     ChessSource,
+    FacebookAdsSource,
     GoogleSheetsSource,
     GorgiasSource,
     HubspotSource,
+    KafkaSource,
+    KlaviyoSource,
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
@@ -111,12 +114,18 @@ class SourceDestinationFactory:
             return ChessSource()
         elif self.source_scheme == "stripe":
             return StripeAnalyticsSource()
+        elif self.source_scheme == "facebookads":
+            return FacebookAdsSource()
         elif self.source_scheme == "slack":
             return SlackSource()
         elif self.source_scheme == "hubspot":
             return HubspotSource()
         elif self.source_scheme == "airtable":
             return AirtableSource()
+        elif self.source_scheme == "klaviyo":
+            return KlaviyoSource()
+        elif self.source_scheme == "kafka":
+            return KafkaSource()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
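
The factory maps a URI scheme to a source implementation, so the three new branches expose the facebookads, klaviyo, and kafka schemes. A hypothetical usage sketch follows; only the scheme dispatch is visible in this hunk, so the constructor arguments and the get_source() accessor are assumptions.

# Hypothetical sketch of the scheme dispatch added above. SourceDestinationFactory
# and the scheme strings are real; the constructor arguments, the URI shape, and
# get_source() are assumed here and may differ from the actual factory API.
from ingestr.src.factory import SourceDestinationFactory

factory = SourceDestinationFactory(
    "kafka://?bootstrap_servers=localhost:9092&group_id=ingestr",  # assumed URI shape
    "duckdb:///ingestr.duckdb",
)
source = factory.get_source()  # expected to be a KafkaSource for the "kafka" scheme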

ingestr/src/kafka/__init__.py ADDED

@@ -0,0 +1,103 @@
+"""A source to extract Kafka messages.
+
+When extraction starts, partitions length is checked -
+data is read only up to it, overriding the default Kafka's
+behavior of waiting for new messages in an endless loop.
+"""
+
+from contextlib import closing
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+
+import dlt
+from confluent_kafka import Consumer, Message  # type: ignore
+from dlt.common import logger
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import TAnyDateTime, TDataItem
+
+from .helpers import (
+    KafkaCredentials,
+    OffsetTracker,
+    default_msg_processor,
+)
+
+
+@dlt.resource(
+    name="kafka_messages",
+    table_name=lambda msg: msg["_kafka"]["topic"],
+    standalone=True,
+)
+def kafka_consumer(
+    topics: Union[str, List[str]],
+    credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
+    msg_processor: Optional[
+        Callable[[Message], Dict[str, Any]]
+    ] = default_msg_processor,
+    batch_size: Optional[int] = 3000,
+    batch_timeout: Optional[int] = 3,
+    start_from: Optional[TAnyDateTime] = None,
+) -> Iterable[TDataItem]:
+    """Extract recent messages from the given Kafka topics.
+
+    The resource tracks offsets for all the topics and partitions,
+    and so reads data incrementally.
+
+    Messages from different topics are saved in different tables.
+
+    Args:
+        topics (Union[str, List[str]]): Names of topics to extract.
+        credentials (Optional[Union[KafkaCredentials, Consumer]]):
+            Auth credentials or an initiated Kafka consumer. By default,
+            is taken from secrets.
+        msg_processor (Optional[Callable]): A function-converter,
+            which'll process every Kafka message after it's read and
+            before it's transferred to the destination.
+        batch_size (Optional[int]): Messages batch size to read at once.
+        batch_timeout (Optional[int]): Maximum time to wait for a batch
+            consume, in seconds.
+        start_from (Optional[TAnyDateTime]): A timestamp, at which to start
+            reading. Older messages are ignored.
+
+    Yields:
+        Iterable[TDataItem]: Kafka messages.
+    """
+    if not isinstance(topics, list):
+        topics = [topics]
+
+    if isinstance(credentials, Consumer):
+        consumer = credentials
+    elif isinstance(credentials, KafkaCredentials):
+        consumer = credentials.init_consumer()
+    else:
+        raise TypeError(
+            (
+                "Wrong credentials type provided. Need to be of type: "
+                "KafkaCredentials or confluent_kafka.Consumer"
+            )
+        )
+
+    if start_from is not None:
+        start_from = ensure_pendulum_datetime(start_from)
+
+    tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)  # type: ignore
+
+    # read messages up to the maximum offsets,
+    # not waiting for new messages
+    with closing(consumer):
+        while tracker.has_unread:
+            messages = consumer.consume(batch_size, timeout=batch_timeout)
+            if not messages:
+                break
+
+            batch = []
+            for msg in messages:
+                if msg.error():
+                    err = msg.error()
+                    if err.retriable() or not err.fatal():
+                        logger.warning(f"ERROR: {err} - RETRYING")
+                    else:
+                        raise err
+                else:
+                    batch.append(msg_processor(msg))  # type: ignore
+                    tracker.renew(msg)
+
+            yield batch
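
Since the resource accepts either KafkaCredentials or a ready confluent_kafka.Consumer, a minimal way to exercise it in a dlt pipeline looks roughly like the sketch below; the broker address, group id, topic names, and the duckdb destination are placeholders.

# Minimal sketch: run kafka_consumer in a dlt pipeline with an explicit consumer.
# Broker, group id, topic names and the destination are placeholders.
import dlt
from confluent_kafka import Consumer
from ingestr.src.kafka import kafka_consumer

consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",
        "group.id": "ingestr-demo",
        "auto.offset.reset": "earliest",
    }
)

pipeline = dlt.pipeline(
    pipeline_name="kafka_demo",
    destination="duckdb",
    dataset_name="kafka_data",
)

# each topic lands in its own table because of the table_name lambda on the resource
info = pipeline.run(kafka_consumer(["orders", "clicks"], credentials=consumer))
print(info)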

ingestr/src/kafka/helpers.py ADDED

@@ -0,0 +1,227 @@
+from typing import Any, Dict, List, Optional
+
+from confluent_kafka import Consumer, Message, TopicPartition  # type: ignore
+from confluent_kafka.admin import TopicMetadata  # type: ignore
+from dlt import config
+from dlt.common import pendulum
+from dlt.common.configuration import configspec
+from dlt.common.configuration.specs import CredentialsConfiguration
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.common.typing import DictStrAny, TSecretValue
+from dlt.common.utils import digest128
+
+
+def default_msg_processor(msg: Message) -> Dict[str, Any]:
+    """Basic Kafka message processor.
+
+    Returns the message value and metadata. Timestamp consists of two values:
+    (type of the timestamp, timestamp). Type represents one of the Python
+    Kafka constants:
+        TIMESTAMP_NOT_AVAILABLE - Timestamps not supported by broker.
+        TIMESTAMP_CREATE_TIME - Message creation time (or source / producer time).
+        TIMESTAMP_LOG_APPEND_TIME - Broker receive time.
+
+    Args:
+        msg (confluent_kafka.Message): A single Kafka message.
+
+    Returns:
+        dict: Processed Kafka message.
+    """
+    ts = msg.timestamp()
+    topic = msg.topic()
+    partition = msg.partition()
+    key = msg.key()
+    if key is not None:
+        key = key.decode("utf-8")
+
+    return {
+        "_kafka": {
+            "partition": partition,
+            "topic": topic,
+            "key": key,
+            "offset": msg.offset(),
+            "ts": {
+                "type": ts[0],
+                "value": ensure_pendulum_datetime(ts[1] / 1e3),
+            },
+            "data": msg.value().decode("utf-8"),
+        },
+        "_kafka_msg_id": digest128(topic + str(partition) + str(key)),
+    }
+
+
+class OffsetTracker(dict):  # type: ignore
+    """Object to control offsets of the given topics.
+
+    Tracks all the partitions of the given topics with two params:
+    current offset and maximum offset (partition length).
+
+    Args:
+        consumer (confluent_kafka.Consumer): Kafka consumer.
+        topic_names (List): Names of topics to track.
+        pl_state (DictStrAny): Pipeline current state.
+        start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
+            are read. Older messages are ignored.
+    """
+
+    def __init__(
+        self,
+        consumer: Consumer,
+        topic_names: List[str],
+        pl_state: DictStrAny,
+        start_from: pendulum.DateTime = None,  # type: ignore
+    ):
+        super().__init__()
+
+        self._consumer = consumer
+        self._topics = self._read_topics(topic_names)
+
+        # read/init current offsets
+        self._cur_offsets = pl_state.setdefault(
+            "offsets", {t_name: {} for t_name in topic_names}
+        )
+
+        self._init_partition_offsets(start_from)
+
+    def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
+        """Read the given topics metadata from Kafka.
+
+        Reads all the topics at once, instead of requesting
+        each in a separate call. Returns only those needed.
+
+        Args:
+            topic_names (list): Names of topics to be read.
+
+        Returns:
+            dict: Metadata of the given topics.
+        """
+        tracked_topics = {}
+        topics = self._consumer.list_topics().topics
+
+        for t_name in topic_names:
+            tracked_topics[t_name] = topics[t_name]
+
+        return tracked_topics
+
+    def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
+        """Designate current and maximum offsets for every partition.
+
+        Current offsets are read from the state, if present. Set equal
+        to the partition beginning otherwise.
+
+        Args:
+            start_from (pendulum.DateTime): A timestamp, at which to start
+                reading. Older messages are ignored.
+        """
+        all_parts = []
+        for t_name, topic in self._topics.items():
+            self[t_name] = {}
+
+            # init all the topic partitions from the partitions' metadata
+            parts = [
+                TopicPartition(
+                    t_name,
+                    part,
+                    start_from.int_timestamp * 1000 if start_from is not None else 0,
+                )
+                for part in topic.partitions
+            ]
+
+            # get offsets for the timestamp, if given
+            if start_from is not None:
+                ts_offsets = self._consumer.offsets_for_times(parts)
+
+            # designate current and maximum offsets for every partition
+            for i, part in enumerate(parts):
+                max_offset = self._consumer.get_watermark_offsets(part)[1]
+
+                if start_from is not None:
+                    if ts_offsets[i].offset != -1:
+                        cur_offset = ts_offsets[i].offset
+                    else:
+                        cur_offset = max_offset - 1
+                else:
+                    cur_offset = (
+                        self._cur_offsets[t_name].get(str(part.partition), -1) + 1
+                    )
+
+                self[t_name][str(part.partition)] = {
+                    "cur": cur_offset,
+                    "max": max_offset,
+                }
+
+                parts[i].offset = cur_offset
+
+            all_parts += parts
+
+        # assign the current offsets to the consumer
+        self._consumer.assign(all_parts)
+
+    @property
+    def has_unread(self) -> bool:
+        """Check if there are unread messages in the tracked topics.
+
+        Returns:
+            bool: True, if there are messages to read, False if all
+                the current offsets are equal to their maximums.
+        """
+        for parts in self.values():
+            for part in parts.values():
+                if part["cur"] + 1 < part["max"]:
+                    return True
+
+        return False
+
+    def renew(self, msg: Message) -> None:
+        """Update partition offset from the given message.
+
+        Args:
+            msg (confluent_kafka.Message): A read Kafka message.
+        """
+        topic = msg.topic()
+        partition = str(msg.partition())
+
+        offset = self[topic][partition]
+        offset["cur"] = msg.offset()
+
+        self._cur_offsets[topic][partition] = msg.offset()
+
+
+@configspec
+class KafkaCredentials(CredentialsConfiguration):
+    """Kafka source credentials.
+
+    NOTE: original Kafka credentials are written with a period, e.g.
+    bootstrap.servers. However, KafkaCredentials expects them to
+    use underscore symbols instead, e.g. bootstrap_servers.
+    """
+
+    bootstrap_servers: str = config.value
+    group_id: str = config.value
+    security_protocol: Optional[str] = None
+    sasl_mechanisms: Optional[str] = None
+    sasl_username: Optional[str] = None
+    sasl_password: Optional[TSecretValue] = None
+
+    def init_consumer(self) -> Consumer:
+        """Init a Kafka consumer from these credentials.
+
+        Returns:
+            confluent_kafka.Consumer: an initiated consumer.
+        """
+        config = {
+            "bootstrap.servers": self.bootstrap_servers,
+            "group.id": self.group_id,
+            "auto.offset.reset": "earliest",
+        }
+
+        if self.security_protocol:
+            config["security.protocol"] = self.security_protocol
+        if self.sasl_mechanisms:
+            config["sasl.mechanisms"] = self.sasl_mechanisms
+        if self.sasl_username:
+            config["sasl.username"] = self.sasl_username
+        if self.sasl_password:
+            config["sasl.password"] = self.sasl_password
+
+        return Consumer(config)
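
default_msg_processor keeps the payload as a UTF-8 string under _kafka.data. If a topic carries JSON, a custom processor can reuse it and decode the payload; this is a hypothetical sketch, not code from the package.

# Hypothetical custom processor: reuse default_msg_processor for the metadata,
# but decode the payload as JSON instead of leaving it a raw string.
import json
from typing import Any, Dict

from confluent_kafka import Message

from ingestr.src.kafka.helpers import default_msg_processor


def json_msg_processor(msg: Message) -> Dict[str, Any]:
    item = default_msg_processor(msg)
    item["_kafka"]["data"] = json.loads(item["_kafka"]["data"])
    return item


# pass it to the resource instead of the default:
# kafka_consumer("orders", credentials=consumer, msg_processor=json_msg_processor)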