bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +33 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +128 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
  18. bizon/connectors/destinations/file/src/destination.py +56 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +69 -0
  43. bizon/connectors/sources/kafka/src/decode.py +93 -0
  44. bizon/connectors/sources/kafka/src/source.py +381 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +83 -0
  55. bizon/{destinations → destination}/destination.py +103 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +121 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +113 -24
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +39 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +153 -0
  78. bizon/monitoring/monitor.py +71 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +30 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
  90. bizon-0.1.2.dist-info/RECORD +123 -0
  91. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  92. bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon/destinations/config.py +0 -47
  96. bizon/destinations/file/src/destination.py +0 -27
  97. bizon/sources/kafka/src/source.py +0 -357
  98. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  99. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  100. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  101. bizon-0.1.0.dist-info/RECORD +0 -93
  102. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  103. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  104. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  105. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  106. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  107. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  108. /bizon/{destinations → destination}/models.py +0 -0
  109. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  110. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/connectors/sources/kafka/src/decode.py
@@ -0,0 +1,93 @@
+ import io
+ import struct
+ from functools import lru_cache
+ from typing import Tuple, Union
+
+ import fastavro
+ from avro.schema import Schema
+ from confluent_kafka.serialization import SerializationError
+
+ # Constants for schema ID byte sizes
+ APICURIO_SCHEMA_ID_BYTES = 8
+ CONFLUENT_SCHEMA_ID_BYTES = 4
+ MAGIC_BYTE = 0
+
+
+ class Hashabledict(dict):
+     """A hashable dictionary for caching purposes"""
+
+     def __hash__(self):
+         return hash(frozenset(self.items()))
+
+
+ @lru_cache(maxsize=None)
+ def parse_global_id_from_serialized_message(message: bytes) -> Tuple[int, int]:
+     """
+     Parse the global id from the serialized message.
+
+     Args:
+         message: The serialized message bytes
+
+     Returns:
+         Tuple of (schema_id, number_of_bytes_used_for_schema_id)
+
+     Raises:
+         SerializationError: If the message is invalid or missing schema id
+     """
+     size = len(message)
+
+     if size < CONFLUENT_SCHEMA_ID_BYTES + 1:
+         raise SerializationError("Invalid message. Missing schema id")
+
+     # Create BytesIO object for easier reading
+     message_buffer = io.BytesIO(message)
+     message_buffer.seek(0)
+     magic_byte = message_buffer.read(1)
+
+     if magic_byte != bytes([MAGIC_BYTE]):
+         raise SerializationError(
+             f"Unexpected magic byte {magic_byte}. This message was not produced with a Schema Registry serializer"
+         )
+
+     # Read Confluent schema ID (4 bytes + 1 magic byte)
+     message_buffer.seek(0)
+     schema_id = struct.unpack(">bI", message_buffer.read(CONFLUENT_SCHEMA_ID_BYTES + 1))[1]
+
+     # If schema_id is 0, try reading as Apicurio format (8 bytes)
+     if schema_id == 0:
+         if size < APICURIO_SCHEMA_ID_BYTES + 1:
+             raise SerializationError("Invalid Apicurio message. Missing schema id")
+         message_buffer.seek(0)
+         schema_id = struct.unpack(">bq", message_buffer.read(APICURIO_SCHEMA_ID_BYTES + 1))[1]
+         return schema_id, APICURIO_SCHEMA_ID_BYTES
+     else:
+         return schema_id, CONFLUENT_SCHEMA_ID_BYTES
+
+
+ def decode_avro_message(message_value: bytes, nb_bytes_schema_id: int, avro_schema: Union[Schema, dict]) -> dict:
+     """
+     Decode an Avro message.
+
+     Args:
+         message_value: The raw message bytes
+         nb_bytes_schema_id: Number of bytes used for schema ID
+         avro_schema: The Avro schema (as Schema object or dict)
+
+     Returns:
+         Decoded message as a dictionary
+     """
+     # Create BytesIO from message bytes
+     message_bytes = io.BytesIO(message_value)
+
+     # Skip magic byte and schema ID bytes
+     message_bytes.seek(nb_bytes_schema_id + 1)
+
+     # Decode the message using fastavro
+     if isinstance(avro_schema, Schema):
+         schema_dict = avro_schema.to_json()
+     else:
+         schema_dict = avro_schema
+
+     data = fastavro.schemaless_reader(message_bytes, schema_dict)
+
+     return data
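For context, a minimal usage sketch (not part of the diff) of how the two helpers added in decode.py fit together. The schema, the schema id (42), and the record are invented for this illustration, and the import path simply mirrors the new wheel layout listed above; a Confluent-framed message is one magic byte, a 4-byte big-endian schema id, then the schemaless Avro body.

import io
import struct

import fastavro

from bizon.connectors.sources.kafka.src.decode import (
    decode_avro_message,
    parse_global_id_from_serialized_message,
)

# Hypothetical Avro schema and record, used only for this example
schema = {
    "type": "record",
    "name": "Envelope",
    "fields": [{"name": "id", "type": "int"}, {"name": "email", "type": "string"}],
}

# Frame the payload the way a Confluent Schema Registry producer would:
# magic byte 0, 4-byte big-endian schema id, then the schemaless Avro body
body = io.BytesIO()
fastavro.schemaless_writer(body, schema, {"id": 1, "email": "a@example.com"})
message = struct.pack(">bI", 0, 42) + body.getvalue()

schema_id, nb_bytes = parse_global_id_from_serialized_message(message)  # -> (42, 4)
record = decode_avro_message(message, nb_bytes_schema_id=nb_bytes, avro_schema=schema)
print(schema_id, record)  # 42 {'id': 1, 'email': 'a@example.com'}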
bizon/connectors/sources/kafka/src/source.py
@@ -0,0 +1,381 @@
+ import traceback
+ from datetime import datetime
+ from functools import lru_cache
+ from typing import Any, List, Mapping, Tuple
+
+ import orjson
+ from avro.schema import Schema, parse
+ from confluent_kafka import (
+     Consumer,
+     KafkaError,
+     KafkaException,
+     Message,
+     TopicPartition,
+ )
+ from confluent_kafka.cimpl import KafkaException as CimplKafkaException
+ from loguru import logger
+ from pydantic import BaseModel
+ from pytz import UTC
+
+ from bizon.source.auth.config import AuthType
+ from bizon.source.callback import AbstractSourceCallback
+ from bizon.source.config import SourceSyncModes
+ from bizon.source.models import SourceIteration, SourceRecord
+ from bizon.source.source import AbstractSource
+
+ from .callback import KafkaSourceCallback
+ from .config import KafkaSourceConfig, MessageEncoding, SchemaRegistryType
+ from .decode import (
+     Hashabledict,
+     decode_avro_message,
+     parse_global_id_from_serialized_message,
+ )
+
+
+ class SchemaNotFound(Exception):
+     """Schema not found in the Schema Registry"""
+
+     pass
+
+
+ class OffsetPartition(BaseModel):
+     first: int
+     last: int
+     to_fetch: int = 0
+
+
+ class TopicOffsets(BaseModel):
+     name: str
+     partitions: Mapping[int, OffsetPartition]
+
+     def set_partition_offset(self, index: int, offset: int):
+         self.partitions[index].to_fetch = offset
+
+     def get_partition_offset(self, index: int) -> int:
+         return self.partitions[index].to_fetch
+
+     @property
+     def total_offset(self) -> int:
+         return sum([partition.last for partition in self.partitions.values()])
+
+
+ class KafkaSource(AbstractSource):
+
+     def __init__(self, config: KafkaSourceConfig):
+         super().__init__(config)
+
+         self.config: KafkaSourceConfig = config
+
+         # Kafka consumer configuration.
+         if self.config.authentication.type == AuthType.BASIC:
+             self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
+             self.config.consumer_config["sasl.username"] = self.config.authentication.params.username
+             self.config.consumer_config["sasl.password"] = self.config.authentication.params.password
+
+         # Set the bootstrap servers and group id
+         self.config.consumer_config["group.id"] = self.config.group_id
+         self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers
+
+         # Consumer instance
+         self.consumer = Consumer(self.config.consumer_config)
+
+         # Map topic_name to destination_id
+         self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}
+
+     @staticmethod
+     def streams() -> List[str]:
+         return ["topic"]
+
+     def get_authenticator(self):
+         # We don't use HTTP authentication for Kafka
+         # We use confluence_kafka library to authenticate
+         pass
+
+     @staticmethod
+     def get_config_class() -> AbstractSource:
+         return KafkaSourceConfig
+
+     def get_source_callback_instance(self) -> AbstractSourceCallback:
+         """Return an instance of the source callback, used to commit the offsets of the iterations"""
+         return KafkaSourceCallback(config=self.config)
+
+     def check_connection(self) -> Tuple[bool | Any | None]:
+         """Check the connection to the Kafka source"""
+
+         logger.info(f"Found: {len(self.consumer.list_topics().topics)} topics")
+
+         topics = self.consumer.list_topics().topics
+
+         config_topics = [topic.name for topic in self.config.topics]
+
+         # Display consumer config
+         # We ignore the key sasl.password and sasl.username
+         consumer_config = self.config.consumer_config.copy()
+         consumer_config.pop("sasl.password", None)
+         consumer_config.pop("sasl.username", None)
+         logger.info(f"Consumer config: {consumer_config}")
+
+         for topic in config_topics:
+             if topic not in topics:
+                 logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+                 return False, f"Topic {topic} not found"
+
+             logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")
+
+         return True, None
+
+     def get_number_of_partitions(self, topic: str) -> int:
+         """Get the number of partitions for the topic"""
+         return len(self.consumer.list_topics().topics[topic].partitions)
+
+     def get_offset_partitions(self, topic: str) -> TopicOffsets:
+         """Get the offsets for each partition of the topic"""
+
+         partitions: Mapping[int, OffsetPartition] = {}
+
+         for i in range(self.get_number_of_partitions(topic)):
+             offsets = self.consumer.get_watermark_offsets(TopicPartition(topic, i))
+             partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])
+
+         return TopicOffsets(name=topic, partitions=partitions)
+
+     def get_total_records_count(self) -> int | None:
+         """Get the total number of records in the topic, sum of offsets for each partition"""
+         # Init the consumer
+         total_records = 0
+         for topic in [topic.name for topic in self.config.topics]:
+             total_records += self.get_offset_partitions(topic).total_offset
+         return total_records
+
+     @lru_cache(maxsize=None)
+     def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
+         """Get the schema from the registry, return a hashable dict and an avro schema object"""
+
+         # Apicurio
+         if self.config.authentication.schema_registry_type == SchemaRegistryType.APICURIO:
+             try:
+                 response = self.session.get(
+                     f"{self.config.authentication.schema_registry_url}/apis/registry/v2/ids/globalIds/{global_id}",
+                     auth=(
+                         self.config.authentication.schema_registry_username,
+                         self.config.authentication.schema_registry_password,
+                     ),
+                 )
+                 if response.status_code == 404:
+                     raise SchemaNotFound(f"Schema with global id {global_id} not found")
+
+                 schema_dict = response.json()
+
+             except Exception as e:
+                 logger.error(traceback.format_exc())
+                 raise e
+
+             # Add a name field to the schema as needed by fastavro
+             schema_dict["name"] = "Envelope"
+
+             # Convert the schema dict to an avro schema object
+             avro_schema = parse(orjson.dumps(schema_dict))
+
+             # Convert the schema dict to a hashable dict
+             hashable_dict_schema = Hashabledict(schema_dict)
+
+             return hashable_dict_schema, avro_schema
+
+         else:
+             raise ValueError(f"Schema registry type {self.config.authentication.schema_registry_type} not supported")
+
+     def decode_avro(self, message: Message) -> Tuple[dict, dict]:
+         """Decode the message as avro and return the parsed message and the schema"""
+         global_id, nb_bytes_schema_id = parse_global_id_from_serialized_message(
+             message=message.value(),
+         )
+
+         try:
+             hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
+         except SchemaNotFound as e:
+             logger.error(
+                 (
+                     f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+                     f"message value: {message.value()}."
+                 )
+             )
+             logger.error(traceback.format_exc())
+             raise e
+
+         return (
+             decode_avro_message(
+                 message_value=message.value(),
+                 nb_bytes_schema_id=nb_bytes_schema_id,
+                 avro_schema=avro_schema,
+             ),
+             hashable_dict_schema,
+         )
+
+     def decode_utf_8(self, message: Message) -> Tuple[dict, dict]:
+         """Decode the message as utf-8 and return the parsed message and the schema"""
+         # Decode the message as utf-8
+         return orjson.loads(message.value().decode("utf-8")), {}
+
+     def decode(self, message) -> Tuple[dict, dict]:
+         """Decode the message based on the encoding type
+         Returns parsed message and the schema
+         """
+         if self.config.message_encoding == MessageEncoding.AVRO:
+             return self.decode_avro(message)
+
+         elif self.config.message_encoding == MessageEncoding.UTF_8:
+             return self.decode_utf_8(message)
+
+         else:
+             raise ValueError(f"Message encoding {self.config.message_encoding} not supported")
+
+     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+         """Parse the encoded Kafka messages and return a list of SourceRecord"""
+
+         records = []
+
+         for message in encoded_messages:
+
+             if message.error():
+                 # If the message is too large, we skip it and update the offset
+                 if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
+                     logger.error(
+                         (
+                             f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is too large. "
+                             f"Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
+                         )
+                     )
+
+                 logger.error(
+                     (
+                         f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
+                         f"{message.error()}"
+                     )
+                 )
+                 raise KafkaException(message.error())
+
+             # We skip tombstone messages
+             if self.config.skip_message_empty_value and not message.value():
+                 logger.debug(
+                     f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
+                 )
+                 continue
+
+             # Decode the message
+             try:
+
+                 decoded_message, hashable_dict_schema = self.decode(message)
+
+                 data = {
+                     "topic": message.topic(),
+                     "offset": message.offset(),
+                     "partition": message.partition(),
+                     "timestamp": message.timestamp()[1],
+                     "keys": orjson.loads(message.key().decode("utf-8")) if message.key() else {},
+                     "headers": (
+                         {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
+                     ),
+                     "value": decoded_message,
+                     "schema": hashable_dict_schema,
+                 }
+
+                 records.append(
+                     SourceRecord(
+                         id=f"partition_{message.partition()}_offset_{message.offset()}",
+                         timestamp=datetime.fromtimestamp(message.timestamp()[1] / 1000, tz=UTC),
+                         data=data,
+                         destination_id=self.topic_map[message.topic()],
+                     )
+                 )
+
+             except Exception as e:
+                 logger.error(
+                     (
+                         f"Error while decoding message for topic {message.topic()} on partition {message.partition()}: {e} at offset {message.offset()} "
+                         f"with value: {message.value()} and key: {message.key()}"
+                     )
+                 )
+                 # Try to parse error message from the message
+                 try:
+                     message_raw_text = message.value().decode("utf-8")
+                     logger.error(f"Parsed Kafka value: {message_raw_text}")
+                 except UnicodeDecodeError:
+                     logger.error("Message is not a valid UTF-8 string")
+
+                 logger.error(traceback.format_exc())
+                 raise e
+
+         return records
+
+     def read_topics_manually(self, pagination: dict = None) -> SourceIteration:
+         """Read the topics manually, we use consumer.assign to assign to the partitions and get the offsets"""
+
+         assert len(self.config.topics) == 1, "Only one topic is supported for manual mode"
+
+         # We will use the first topic for the manual mode
+         topic = self.config.topics[0]
+
+         nb_partitions = self.get_number_of_partitions(topic=topic.name)
+
+         # Setup offset_pagination
+         self.topic_offsets = (
+             TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions(topic=topic.name)
+         )
+
+         self.consumer.assign(
+             [
+                 TopicPartition(topic.name, partition, self.topic_offsets.get_partition_offset(partition))
+                 for partition in range(nb_partitions)
+             ]
+         )
+
+         t1 = datetime.now()
+         encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+         logger.info(f"Kafka consumer read : {len(encoded_messages)} messages in {datetime.now() - t1}")
+
+         records = self.parse_encoded_messages(encoded_messages)
+
+         # Update the offset for the partition
+         if not records:
+             logger.info("No new records found, stopping iteration")
+             return SourceIteration(
+                 next_pagination={},
+                 records=[],
+             )
+
+         # Update the offset for the partition
+         self.topic_offsets.set_partition_offset(encoded_messages[-1].partition(), encoded_messages[-1].offset() + 1)
+
+         return SourceIteration(
+             next_pagination=self.topic_offsets.model_dump(),
+             records=records,
+         )
+
+     def read_topics_with_subscribe(self, pagination: dict = None) -> SourceIteration:
+         """Read the topics with the subscribe method, pagination will not be used
+         We rely on Kafka to get assigned to the partitions and get the offsets
+         """
+         topics = [topic.name for topic in self.config.topics]
+         self.consumer.subscribe(topics)
+         t1 = datetime.now()
+         encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+         logger.info(f"Kafka consumer read : {len(encoded_messages)} messages in {datetime.now() - t1}")
+         records = self.parse_encoded_messages(encoded_messages)
+         return SourceIteration(
+             next_pagination={},
+             records=records,
+         )
+
+     def get(self, pagination: dict = None) -> SourceIteration:
+         if self.config.sync_mode == SourceSyncModes.STREAM:
+             return self.read_topics_with_subscribe(pagination)
+         else:
+             return self.read_topics_manually(pagination)
+
+     def commit(self):
+         """Commit the offsets of the consumer"""
+         try:
+             self.consumer.commit(asynchronous=False)
+         except CimplKafkaException as e:
+             logger.error(f"Kafka exception occurred during commit: {e}")
+             logger.info("Gracefully exiting without committing offsets due to Kafka exception")
bizon/connectors/sources/kafka/tests/kafka_pipeline.py
@@ -0,0 +1,7 @@
+ import os
+
+ from bizon.engine.engine import RunnerFactory
+
+ if __name__ == "__main__":
+     runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("test-pipeline-streaming-v2.yml"))
+     runner.run()
bizon/connectors/sources/periscope/config/periscope_charts.example.yml
@@ -0,0 +1,20 @@
+ name: periscope to logger
+
+ source:
+   name: periscope
+   stream: charts
+   max_iterations: 20
+   workspace_name: MY_WORKSPACE_NAME
+   client_site_id: 99999
+   database_id: 999
+   authentication:
+     type: cookies
+     params:
+       cookies:
+         periscope_session: ooooooooo
+         cf_bm: kkkkkkkk
+
+ destination:
+   name: logger
+   config:
+     dummy: dummy
bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml
@@ -0,0 +1,20 @@
+ name: periscope dashboards to logger
+
+ source:
+   name: periscope
+   stream: dashboards
+   max_iterations: 20
+   workspace_name: MY_WORKSPACE_NAME
+   client_site_id: 99999
+   database_id: 999
+   authentication:
+     type: cookies
+     params:
+       cookies:
+         periscope_session: ooooooooo
+         cf_bm: kkkkkkkk
+
+ destination:
+   name: logger
+   config:
+     dummy: dummy