omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,120 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A source to extract Kafka messages.
16
+
17
+ When extraction starts, partitions length is checked -
18
+ data is read only up to it, overriding the default Kafka's
19
+ behavior of waiting for new messages in endless loop.
20
+ """
21
+
22
+ from contextlib import closing
23
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
24
+
25
+ import dlt
26
+ from confluent_kafka import Consumer, Message # type: ignore
27
+ from dlt.common import logger
28
+ from dlt.common.time import ensure_pendulum_datetime
29
+ from dlt.common.typing import TAnyDateTime, TDataItem
30
+
31
+ from .helpers import (
32
+ KafkaCredentials,
33
+ OffsetTracker,
34
+ default_msg_processor,
35
+ )
36
+
37
+
38
@dlt.resource(
    name="kafka_messages",
    table_name=lambda msg: msg["_kafka"]["topic"],
    standalone=True,
)
def kafka_consumer(
    topics: Union[str, List[str]],
    credentials: Union[KafkaCredentials, Consumer] = dlt.secrets.value,
    msg_processor: Optional[
        Callable[[Message], Dict[str, Any]]
    ] = default_msg_processor,
    batch_size: Optional[int] = 3000,
    batch_timeout: Optional[int] = 3,
    start_from: Optional[TAnyDateTime] = None,
) -> Iterable[TDataItem]:
    """Extract recent messages from the given Kafka topics.

    The resource tracks offsets for all the topics and partitions,
    and so reads data incrementally.

    Messages from different topics are saved in different tables
    (the table name is taken from the `_kafka.topic` field of each item).

    Args:
        topics (Union[str, List[str]]): Names of topics to extract.
        credentials (Optional[Union[KafkaCredentials, Consumer]]):
            Auth credentials or an initiated Kafka consumer. By default,
            is taken from secrets.
        msg_processor (Optional[Callable]): A function-converter,
            which'll process every Kafka message after it's read and
            before it's transferred to the destination. `None` falls
            back to `default_msg_processor`.
        batch_size (Optional[int]): Messages batch size to read at once.
        batch_timeout (Optional[int]): Maximum time to wait for a batch
            consume, in seconds.
        start_from (Optional[TAnyDateTime]): A timestamp, at which to start
            reading. Older messages are ignored.

    Yields:
        Iterable[TDataItem]: Batches (lists) of processed Kafka messages.

    Raises:
        TypeError: If `credentials` is neither `KafkaCredentials` nor a
            `confluent_kafka.Consumer`.
    """
    if not isinstance(topics, list):
        topics = [topics]

    # the annotation allows None, so treat it as "use the default processor"
    if msg_processor is None:
        msg_processor = default_msg_processor

    if isinstance(credentials, Consumer):
        consumer = credentials
    elif isinstance(credentials, KafkaCredentials):
        consumer = credentials.init_consumer()
    else:
        raise TypeError(
            (
                "Wrong credentials type provided. Need to be of type: "
                "KafkaCredentials or confluent_kafka.Consumer"
            )
        )

    if start_from is not None:
        start_from = ensure_pendulum_datetime(start_from)

    with closing(consumer):
        # The tracker talks to the broker while it is being built, so it is
        # created inside the `closing` scope: if it fails, the consumer is
        # still closed instead of being leaked.
        tracker = OffsetTracker(consumer, topics, dlt.current.resource_state(), start_from)  # type: ignore

        # read messages up to the maximum offsets,
        # not waiting for new messages
        while True:
            messages = consumer.consume(batch_size, timeout=batch_timeout)
            if not messages:
                break

            batch = []
            for msg in messages:
                err = msg.error()
                if err:
                    if err.retriable() or not err.fatal():
                        logger.warning(f"ERROR: {err} - RETRYING")
                    else:
                        raise err
                else:
                    batch.append(msg_processor(msg))
                    tracker.renew(msg)

            yield batch

            if tracker.has_unread is False:
                return
@@ -0,0 +1,241 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Dict, List, Optional
16
+
17
+ from confluent_kafka import Consumer, Message, TopicPartition # type: ignore
18
+ from confluent_kafka.admin import TopicMetadata # type: ignore
19
+ from dlt import config
20
+ from dlt.common import pendulum
21
+ from dlt.common.configuration import configspec
22
+ from dlt.common.configuration.specs import CredentialsConfiguration
23
+ from dlt.common.time import ensure_pendulum_datetime
24
+ from dlt.common.typing import DictStrAny, TSecretValue
25
+ from dlt.common.utils import digest128
26
+
27
+
28
def default_msg_processor(msg: Message) -> Dict[str, Any]:
    """Basic Kafka message processor.

    Returns the message value and metadata. Timestamp consists of two values:
    (type of the timestamp, timestamp). Type represents one of the Python
    Kafka constants:
        TIMESTAMP_NOT_AVAILABLE - Timestamps not supported by broker.
        TIMESTAMP_CREATE_TIME - Message creation time (or source / producer time).
        TIMESTAMP_LOG_APPEND_TIME - Broker receive time.

    Args:
        msg (confluent_kafka.Message): A single Kafka message.

    Returns:
        dict: Processed Kafka message. `_kafka.key` and `_kafka.data` are
            UTF-8 decoded strings, or None when the message carries no
            key / no payload.
    """
    ts = msg.timestamp()
    topic = msg.topic()
    partition = msg.partition()

    key = msg.key()
    if isinstance(key, bytes):
        key = key.decode("utf-8")

    # value() returns None when the message has no payload (presumably e.g.
    # tombstone records); decoding unconditionally would raise AttributeError.
    data = msg.value()
    if isinstance(data, bytes):
        data = data.decode("utf-8")

    return {
        "_kafka": {
            "partition": partition,
            "topic": topic,
            "key": key,
            "offset": msg.offset(),
            "ts": {
                "type": ts[0],
                # the raw timestamp is divided by 1e3 before conversion
                "value": ensure_pendulum_datetime(ts[1] / 1e3),
            },
            "data": data,
        },
        "_kafka_msg_id": digest128(topic + str(partition) + str(key)),
    }
65
+
66
+
67
class OffsetTracker(dict):  # type: ignore
    """Object to control offsets of the given topics.

    Tracks all the partitions of the given topics with two params:
    current offset and maximum offset (partition length). The tracker
    itself is a mapping of the form
    `{topic: {partition: {"cur": int, "max": int}}}`.

    NOTE: right after initialisation `cur` holds the offset assigned to the
    consumer (the next offset to read); after `renew` it holds the offset
    of the last message read from that partition.

    Args:
        consumer (confluent_kafka.Consumer): Kafka consumer.
        topic_names (List): Names of topics to track.
        pl_state (DictStrAny): Pipeline current state.
        start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
            are read. Older messages are ignored.
    """

    def __init__(
        self,
        consumer: Consumer,
        topic_names: List[str],
        pl_state: DictStrAny,
        start_from: pendulum.DateTime = None,  # type: ignore
    ):
        super().__init__()

        self._consumer = consumer
        self._topics = self._read_topics(topic_names)

        # read/init current offsets
        self._cur_offsets = self._load_state_offsets(pl_state, topic_names)

        self._init_partition_offsets(start_from)

    @staticmethod
    def _load_state_offsets(
        pl_state: Dict[str, Any], topic_names: List[str]
    ) -> Dict[str, Any]:
        """Return the persisted offsets mapping, with an entry for every topic.

        A state saved by an earlier run may not know topics that were added
        since; those get an empty partition mapping so later lookups by
        topic name do not fail.

        Args:
            pl_state (dict): Pipeline state; its "offsets" key is created
                if missing.
            topic_names (list): Names of the tracked topics.

        Returns:
            dict: The `pl_state["offsets"]` mapping (same object, so that
                updates made by `renew` end up in the state).
        """
        offsets = pl_state.setdefault("offsets", {})
        for t_name in topic_names:
            offsets.setdefault(t_name, {})
        return offsets

    def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
        """Read the given topics metadata from Kafka.

        Reads all the topics at once, instead of requesting
        each in a separate call. Returns only those needed.

        Args:
            topic_names (list): Names of topics to be read.

        Returns:
            dict: Metadata of the given topics.

        Raises:
            KeyError: If any of the requested topics is not present in the
                metadata returned by the consumer.
        """
        topics = self._consumer.list_topics().topics

        missing = [t_name for t_name in topic_names if t_name not in topics]
        if missing:
            raise KeyError(f"Kafka topics not found: {missing}")

        return {t_name: topics[t_name] for t_name in topic_names}

    def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
        """Designate current and maximum offsets for every partition.

        Current offsets are read from the state, if present. Set equal
        to the partition beginning otherwise. When `start_from` is given,
        the state is ignored and offsets are looked up by timestamp.

        Args:
            start_from (pendulum.DateTime): A timestamp, at which to start
                reading. Older messages are ignored.
        """
        by_timestamp = start_from is not None
        # offsets_for_times expects the timestamp in the offset field
        lookup_ts = start_from.int_timestamp * 1000 if by_timestamp else 0

        all_parts = []
        for t_name, topic in self._topics.items():
            self[t_name] = {}

            # init all the topic partitions from the partitions' metadata
            parts = [
                TopicPartition(t_name, part, lookup_ts) for part in topic.partitions
            ]

            # get offsets for the timestamp, if given
            ts_offsets = (
                self._consumer.offsets_for_times(parts) if by_timestamp else None
            )

            # designate current and maximum offsets for every partition
            for i, part in enumerate(parts):
                max_offset = self._consumer.get_watermark_offsets(part)[1]

                if ts_offsets is not None:
                    if ts_offsets[i].offset != -1:
                        cur_offset = ts_offsets[i].offset
                    else:
                        # no offset was resolved for the timestamp
                        cur_offset = max_offset - 1
                else:
                    # continue right after the last offset saved in the state
                    cur_offset = (
                        self._cur_offsets[t_name].get(str(part.partition), -1) + 1
                    )

                self[t_name][str(part.partition)] = {
                    "cur": cur_offset,
                    "max": max_offset,
                }

                part.offset = cur_offset

            all_parts += parts

        # assign the current offsets to the consumer
        self._consumer.assign(all_parts)

    @property
    def has_unread(self) -> bool:
        """Check if there are unread messages in the tracked topics.

        Returns:
            bool: True, if any tracked partition has `cur + 1 < max`,
                False otherwise.
        """
        return any(
            part["cur"] + 1 < part["max"]
            for parts in self.values()
            for part in parts.values()
        )

    def renew(self, msg: Message) -> None:
        """Update partition offset from the given message.

        Updates both the in-memory tracker and the offsets kept in the
        pipeline state.

        Args:
            msg (confluent_kafka.Message): A read Kafka message.
        """
        topic = msg.topic()
        partition = str(msg.partition())
        offset = msg.offset()

        self[topic][partition]["cur"] = offset
        self._cur_offsets[topic][partition] = offset
202
+
203
+
204
@configspec
class KafkaCredentials(CredentialsConfiguration):
    """Credentials for the Kafka source.

    NOTE: native Kafka settings are spelled with periods
    (e.g. bootstrap.servers), whereas the fields of this spec use
    underscores instead (e.g. bootstrap_servers).
    """

    bootstrap_servers: str = config.value
    group_id: str = config.value
    security_protocol: Optional[str] = None
    sasl_mechanisms: Optional[str] = None
    sasl_username: Optional[str] = None
    sasl_password: Optional[TSecretValue] = None

    def init_consumer(self) -> Consumer:
        """Build a Kafka consumer out of these credentials.

        Returns:
            confluent_kafka.Consumer: an initiated consumer.
        """
        # mandatory settings are always passed to the consumer
        settings = {
            "bootstrap.servers": self.bootstrap_servers,
            "group.id": self.group_id,
            "auto.offset.reset": "earliest",
        }

        # optional settings are passed only when they hold a truthy value
        optional_settings = {
            "security.protocol": self.security_protocol,
            "sasl.mechanisms": self.sasl_mechanisms,
            "sasl.username": self.sasl_username,
            "sasl.password": self.sasl_password,
        }
        for option, value in optional_settings.items():
            if value:
                settings[option] = value

        return Consumer(settings)
@@ -0,0 +1,153 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Reads messages from Kinesis queue."""
16
+
17
+ from typing import Iterable, List, Optional
18
+
19
+ import dlt
20
+ from dlt.common import json, pendulum
21
+ from dlt.common.configuration.specs import AwsCredentials
22
+ from dlt.common.time import ensure_pendulum_datetime
23
+ from dlt.common.typing import StrStr, TAnyDateTime, TDataItem
24
+ from dlt.common.utils import digest128
25
+
26
+ from .helpers import get_shard_iterator, get_stream_address, max_sequence_by_shard
27
+
28
+
29
@dlt.resource(
    name=lambda args: args["stream_name"],
    primary_key="kinesis_msg_id",
    standalone=True,
    max_table_nesting=0,
)
def kinesis_stream(
    stream_name: str,
    initial_at_timestamp: TAnyDateTime,
    credentials: AwsCredentials,
    last_msg: Optional[dlt.sources.incremental[StrStr]] = dlt.sources.incremental(
        "kinesis", last_value_func=max_sequence_by_shard
    ),
    max_number_of_messages: int = None,  # type: ignore
    milliseconds_behind_latest: int = 1000,
    parse_json: bool = True,
    chunk_size: int = 1000,
) -> Iterable[TDataItem]:
    """Reads a kinesis stream and yields messages. Supports incremental loading. Parses messages as json by default.

    Args:
        stream_name (str): The name (or ARN) of the stream to read from.
        initial_at_timestamp (TAnyDateTime): An initial timestamp used to generate AT_TIMESTAMP or LATEST iterator
            when timestamp value is 0. A timestamp saved in the resource state by a previous run takes precedence.
        credentials (AwsCredentials): The credentials to use to connect to kinesis.
        last_msg (Optional[dlt.sources.incremental]): An incremental over a mapping from shard_id to message sequence
            that will be used to create shard iterators of type AFTER_SEQUENCE_NUMBER when loading incrementally.
        max_number_of_messages (int): Maximum number of messages to read in one run. Actual read may exceed that number by up to chunk_size. Defaults to None (no limit).
        milliseconds_behind_latest (int): The number of milliseconds behind the top of the shard to stop reading messages, defaults to 1000.
        parse_json (bool): If True, assumes that messages are json strings, parses them and merges the result into
            the message; otherwise the raw payload is returned in `data`. Defaults to True.
        chunk_size (int): The number of records to fetch at once. Defaults to 1000.

    Yields:
        Iterable[TDataItem]: Lists of messages. Each contains the Kinesis envelope in `kinesis` and the raw
            payload in `data` (if `parse_json` is disabled).
    """
    session = credentials._to_botocore_session()
    # the default timeouts are (60, 60) which is fine
    kinesis_client = session.create_client("kinesis")
    # normalize at_timestamp to pendulum
    initial_at_datetime = (
        None
        if initial_at_timestamp is None
        else ensure_pendulum_datetime(initial_at_timestamp)
    )
    # a timestamp stored in the state by a previous run wins over the argument
    resource_state = dlt.current.resource_state()
    initial_at_datetime = resource_state.get(
        "initial_at_timestamp", initial_at_datetime
    )
    # so next time we request shards at AT_TIMESTAMP that is now
    resource_state["initial_at_timestamp"] = pendulum.now("UTC").subtract(seconds=1)

    # collect the shards from every page of the listing
    shards_list = kinesis_client.list_shards(**get_stream_address(stream_name))
    shards: List[StrStr] = list(shards_list["Shards"])
    while next_token := shards_list.get("NextToken"):
        shards_list = kinesis_client.list_shards(NextToken=next_token)
        shards.extend(shards_list["Shards"])

    shard_ids = [shard["ShardId"] for shard in shards]
    # every shard id ever queued, so that child shards are not queued twice
    known_shard_ids = set(shard_ids)

    # get next shard to fetch messages from
    while shard_ids:
        shard_id = shard_ids.pop(0)
        shard_iterator, _ = get_shard_iterator(
            kinesis_client,
            stream_name,
            shard_id,
            last_msg,  # type: ignore
            initial_at_datetime,  # type: ignore
        )

        while shard_iterator:
            records = []
            records_response = kinesis_client.get_records(
                ShardIterator=shard_iterator,
                Limit=chunk_size,  # The size of data can be up to 1 MB, it must be controlled by the user
            )

            for record in records_response["Records"]:
                sequence_number = record["SequenceNumber"]
                content = record["Data"]

                arrival_time = record["ApproximateArrivalTimestamp"]
                arrival_timestamp = arrival_time.astimezone(pendulum.UTC)

                message = {
                    "kinesis": {
                        "shard_id": shard_id,
                        "seq_no": sequence_number,
                        "ts": ensure_pendulum_datetime(arrival_timestamp),
                        "partition": record["PartitionKey"],
                        "stream_name": stream_name,
                    },
                    "kinesis_msg_id": digest128(shard_id + sequence_number),
                }

                if parse_json:
                    message.update(json.loadb(content))
                else:
                    message["data"] = content
                records.append(message)
            yield records

            # do not load more max_number_of_messages
            if max_number_of_messages is not None:
                max_number_of_messages -= len(records)
                if max_number_of_messages <= 0:
                    return

            # add child shards so we can request messages from them
            for child_shard in records_response.get("ChildShards") or []:
                child_shard_id = child_shard["ShardId"]
                if child_shard_id not in known_shard_ids:
                    known_shard_ids.add(child_shard_id)
                    shard_ids.append(child_shard_id)

            # gets 0 when no messages so we cutoff empty shards
            records_ms_behind_latest = records_response.get("MillisBehindLatest", 0)
            if records_ms_behind_latest < milliseconds_behind_latest:
                # stop taking messages from shard
                shard_iterator = None  # type: ignore
            else:
                # continue taking messages; a response without a next
                # iterator ends the shard loop as well
                shard_iterator = records_response.get("NextShardIterator")
@@ -0,0 +1,96 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Sequence, Tuple
16
+
17
+ import dlt
18
+ from dlt.common import pendulum
19
+ from dlt.common.typing import DictStrAny, DictStrStr, StrAny, StrStr
20
+
21
+
22
def get_shard_iterator(
    kinesis_client: Any,
    stream_name: str,
    shard_id: str,
    last_msg: dlt.sources.incremental[StrStr],
    initial_at_timestamp: pendulum.DateTime | None,
) -> Tuple[str, StrAny]:
    """Requests an iterator over shard `shard_id` of stream `stream_name`.

    The iterator type is chosen in this order:
      1. AFTER_SEQUENCE_NUMBER, when the `last_msg` incremental holds a message
         sequence for `shard_id`,
      2. TRIM_HORIZON, when `initial_at_timestamp` is None,
      3. LATEST, when `initial_at_timestamp` is the zero timestamp,
      4. AT_TIMESTAMP with `initial_at_timestamp` otherwise.

    Returns:
        A tuple of the shard iterator and the parameters used to request it.
    """
    if last_msg is None:
        known_sequences = {}
    else:
        known_sequences = last_msg.last_value or last_msg.initial_value or {}

    params: DictStrAny
    resume_after = known_sequences.get(shard_id, None)
    if resume_after:
        # continue right after the last message seen on this shard
        params = {
            "ShardIteratorType": "AFTER_SEQUENCE_NUMBER",
            "StartingSequenceNumber": resume_after,
        }
    elif initial_at_timestamp is None:
        # fetch all records from the beginning
        params = {"ShardIteratorType": "TRIM_HORIZON"}
    elif initial_at_timestamp.timestamp() == 0.0:
        # only the messages at the tip of the stream are read
        params = {"ShardIteratorType": "LATEST"}
    else:
        params = {
            "ShardIteratorType": "AT_TIMESTAMP",
            "Timestamp": initial_at_timestamp.timestamp(),
        }

    response: StrStr = kinesis_client.get_shard_iterator(
        **get_stream_address(stream_name), ShardId=shard_id, **params
    )
    return response["ShardIterator"], params
60
+
61
+
62
def max_sequence_by_shard(values: Sequence[StrStr]) -> StrStr:
    """A last_value_function that operates on mapping of shard_id:msg_sequence defining the max.

    Args:
        values: Either a one-element sequence holding the kinesis metadata of
            a message, or a pair of (kinesis metadata, previous state of the
            shards). The metadata must contain `shard_id` and `seq_no`.

    Returns:
        A new mapping of shard_id to the greatest message sequence seen;
        the previous state is never modified in place.
    """
    if len(values) == 1:
        # only the kinesis metadata is given, there is no previous state
        item, previous = values[0], None
    else:
        # item is kinesis metadata, previous is the last state of the shards
        item, previous = values

    # always make a copy
    merged = dict(previous) if previous else {}

    shard_id = item["shard_id"]
    # Sequence numbers are numeric strings, so they are ordered by length
    # first and lexicographically second - a plain string comparison would
    # rank "9" above "10". The empty default sorts below any real sequence.
    merged[shard_id] = max(
        item["seq_no"],
        merged.get(shard_id, ""),
        key=lambda seq: (len(seq), seq),
    )
    return merged
80
+
81
+
82
def get_stream_address(stream_name: str) -> DictStrStr:
    """
    Build the stream address argument: `StreamARN` for names that start
    with "arn:", `StreamName` for anything else.

    Examples:
    - customer_events
    - arn:aws:kinesis:eu-central-1:842404475894:stream/customer_events

    https://docs.aws.amazon.com/kinesis/latest/APIReference/API_StreamDescription.html#Streams-Type-StreamDescription-StreamName
    https://docs.aws.amazon.com/kinesis/latest/APIReference/API_StreamDescription.html#Streams-Type-StreamDescription-StreamARN
    """
    address_key = "StreamARN" if stream_name.startswith("arn:") else "StreamName"
    return {address_key: stream_name}