bizon 0.0.13__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,16 +3,17 @@ import json
  import logging
  import struct
  import traceback
- from datetime import datetime, timezone
+ from datetime import datetime
  from enum import Enum
  from functools import lru_cache
- from typing import Any, List, Literal, Mapping, Optional, Tuple
+ from typing import Any, List, Literal, Mapping, Tuple

  import fastavro
  from avro.schema import Schema, parse
- from confluent_kafka import Consumer, KafkaError, KafkaException, TopicPartition
+ from confluent_kafka import Consumer, KafkaException, TopicPartition
  from loguru import logger
  from pydantic import BaseModel, Field
+ from pytz import UTC
  from requests.exceptions import HTTPError

  from bizon.source.auth.config import AuthConfig, AuthType
@@ -30,7 +31,6 @@ class ApicurioSchemaNotFound(Exception):

  class SchemaRegistryType(str, Enum):
      APICURIO = "apicurio"
-     CONFLUENT = "confluent"


  class KafkaAuthConfig(AuthConfig):
@@ -47,22 +47,40 @@ class KafkaAuthConfig(AuthConfig):
      schema_registry_password: str = Field(default="", description="Schema registry password")


+ def default_kafka_consumer_config():
+     return {
+         "auto.offset.reset": "earliest",
+         "enable.auto.commit": False,  # Turn off auto-commit for manual offset handling
+         "session.timeout.ms": 45000,
+         "security.protocol": "SASL_SSL",
+     }
+
+
  class KafkaSourceConfig(SourceConfig):
+
+     # Mandatory Kafka configuration
      topic: str = Field(..., description="Kafka topic")
-     bootstrap_server: str = Field(..., description="Kafka bootstrap servers")
+     bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
+     group_id: str = Field(default="bizon", description="Kafka group id")
+
+     skip_message_empty_value: bool = Field(
+         default=True, description="Skip messages with empty value (tombstone messages)"
+     )

      # Kafka consumer configuration
-     batch_size: int = Field(100, description="Kafka batch size")
-     consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds")
-     fetch_max_size: int = Field(500000000, description="Kafka fetch max size in bytes")
-     receive_message_max_size: int = Field(1000000000, description="Kafka receive message max size in bytes")
+     batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
+     consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")

-     group_id: str = Field("bizon", description="Kafka group id")
+     consumer_config: Mapping[str, Any] = Field(
+         default_factory=default_kafka_consumer_config,
+         description="Kafka consumer configuration, as described in the confluent-kafka-python documentation",
+     )

+     # Schema ID header configuration
      nb_bytes_schema_id: Literal[4, 8] = Field(
-         4, description="Number of bytes for the schema id. 4 is the default for majority of the cases"
+         description="Number of bytes encode SchemaID in Kafka message. Standard is 4.",
+         default=4,
      )
-     timestamp_ms_name: Optional[str] = Field(default="", description="Name of the timestamp field in the Avro schema")

      authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
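
For context on the nb_bytes_schema_id field above: messages produced through a schema registry typically carry the schema ID in a small header in front of the Avro payload. A minimal sketch of that framing, assuming the Confluent-style layout of one magic byte followed by a big-endian schema ID (the helper name and the 8-byte branch are illustrative assumptions, not code from the package):

    import struct
    from typing import Tuple

    def extract_schema_id(payload: bytes, nb_bytes_schema_id: int = 4) -> Tuple[int, bytes]:
        """Hypothetical helper: split a raw Kafka message into (schema_id, avro_payload)."""
        if nb_bytes_schema_id == 4:
            # 1 magic byte + 4-byte big-endian schema ID (Confluent wire format)
            _, schema_id = struct.unpack(">bI", payload[:5])
            return schema_id, payload[5:]
        # 8-byte variant, e.g. registries that use 64-bit global IDs
        _, schema_id = struct.unpack(">bq", payload[:9])
        return schema_id, payload[9:]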
 
@@ -95,28 +113,18 @@ class KafkaSource(AbstractSource):

          self.config: KafkaSourceConfig = config

-         self.parse_timestamp: bool = self.config.timestamp_ms_name != ""
-
-         self.kafka_consumer_conf = {
-             "bootstrap.servers": self.config.bootstrap_server,
-             "group.id": self.config.group_id,
-             "sasl.username": self.config.authentication.params.username,
-             "sasl.password": self.config.authentication.params.password,
-             "security.protocol": "SASL_SSL",
-             "sasl.mechanisms": "PLAIN",
-             "session.timeout.ms": 45000,
-             "auto.offset.reset": "earliest",
-             "enable.auto.commit": False,  # Turn off auto-commit for manual offset handling
-             # Increase the max fetch / receive message size
-             "fetch.max.bytes": self.config.fetch_max_size,
-             "receive.message.max.bytes": self.config.receive_message_max_size,
-         }
+         # Kafka consumer configuration
+         if self.config.authentication.type == AuthType.BASIC:
+             self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
+             self.config.consumer_config["sasl.username"] = self.config.authentication.params.username
+             self.config.consumer_config["sasl.password"] = self.config.authentication.params.password

-         # Consumer instance
-         self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+         # Set the bootstrap servers and group id
+         self.config.consumer_config["group.id"] = self.config.group_id
+         self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers

-         # Consumers for each worker thread
-         self.consumers_cached: Mapping[int, Consumer] = {}
+         # Consumer instance
+         self.consumer = Consumer(self.config.consumer_config, logger=silent_logger)

      @staticmethod
      def streams() -> List[str]:
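
Taken together with default_kafka_consumer_config() above, the constructor now layers authentication and connection settings on top of the user-supplied (or default) consumer_config before handing it to confluent_kafka.Consumer. For a BASIC-auth source the effective dictionary looks roughly like this (credentials and brokers are placeholders):

    effective_consumer_config = {
        "auto.offset.reset": "earliest",
        "enable.auto.commit": False,       # offsets are committed manually by the source
        "session.timeout.ms": 45000,
        "security.protocol": "SASL_SSL",
        "sasl.mechanisms": "PLAIN",        # set when authentication.type == AuthType.BASIC
        "sasl.username": "<username>",
        "sasl.password": "<password>",
        "group.id": "bizon",               # from group_id
        "bootstrap.servers": "<broker-1:9092,broker-2:9092>",  # from bootstrap_servers
    }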
@@ -234,29 +242,21 @@ class KafkaSource(AbstractSource):

          records = []

-         # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
-         source_timestamp = datetime.now(tz=timezone.utc)
-
          for message in encoded_messages:

              if message.error():
                  logger.error(
-                     f"Error while consuming message for partition {message.partition()} and offset {message.offset()}: {message.error()}"
-                 )
-
-                 # Skip the message if it's too large
-                 if message.error() == KafkaError.MSG_SIZE_TOO_LARGE:
-                     logger.error(
-                         f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is too large."
+                     (
+                         f"Error while consuming message for partition {message.partition()} and offset {message.offset()}: "
+                         f"{message.error()}"
                      )
-                     # Define what we want to do (?)
-                     # self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
-
+                 )
                  raise KafkaException(message.error())

-             if not message.value():
+             # We skip tombstone messages
+             if self.config.skip_message_empty_value and not message.value():
                  logger.debug(
-                     f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                     f"Message for partition {message.partition()} and offset {message.offset()} is empty, skipping."
                  )
                  continue
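
A tombstone is a record that carries a key but an empty value, conventionally used to mark a key as deleted in a compacted topic. With skip_message_empty_value left at its default of True, such records are skipped rather than passed to the Avro decoder. For illustration only (a producer-side sketch, not code from bizon), a tombstone is written like this:

    from confluent_kafka import Producer

    producer = Producer({"bootstrap.servers": "localhost:9092"})  # placeholder broker
    # Key present, value None: consumers see message.value() == None for this record
    producer.produce(topic="my-topic", key=b"user-42", value=None)
    producer.flush()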
 
@@ -282,27 +282,26 @@ class KafkaSource(AbstractSource):

              # Decode the message
              try:
-                 data = self.decode(message.value(), schema)
-
-                 # Add the message key to the data
-                 data["_bizon_message_key"] = message.key().decode("utf-8")

-                 # Get the source timestamp
-                 if self.parse_timestamp:
-                     source_timestamp = datetime.fromtimestamp(
-                         data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
-                     )
-
-                 self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+                 data = {
+                     "offset": message.offset(),
+                     "partition": message.partition(),
+                     "timestamp": message.timestamp()[1],
+                     "value": self.decode(message.value(), schema),
+                     "key": message.key().decode("utf-8"),
+                 }

                  records.append(
                      SourceRecord(
-                         id=f"part_{message.partition()}_offset_{message.offset()}",
-                         timestamp=source_timestamp,
+                         id=f"partition_{message.partition()}_offset_{message.offset()}",
+                         timestamp=datetime.fromtimestamp(message.timestamp()[1] / 1000, tz=UTC),
                          data=data,
                      )
                  )

+                 # Update the offset for the partition
+                 self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+
              except Exception as e:
                  logger.error(
                      (
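
The record timestamp is now taken from the message itself: in confluent-kafka, message.timestamp() returns a (timestamp_type, timestamp) pair with the timestamp expressed in milliseconds since the epoch, hence the [1] index and the division by 1000 before building a timezone-aware datetime. A small worked example of the same conversion:

    from datetime import datetime
    from pytz import UTC

    ts_ms = 1714060800000  # example value of message.timestamp()[1]
    print(datetime.fromtimestamp(ts_ms / 1000, tz=UTC))  # 2024-04-25 16:00:00+00:00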
@@ -337,7 +336,7 @@ class KafkaSource(AbstractSource):

          t1 = datetime.now()
          encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
-         logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+         logger.info(f"Kafka consumer read : {len(encoded_messages)} messages in {datetime.now() - t1}")

          records = self.parse_encoded_messages(encoded_messages)
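
Because enable.auto.commit is False in the default consumer configuration, offsets must be committed explicitly once a batch has been processed; this diff shows the source tracking them via self.topic_offsets, but the commit mechanics are not part of these hunks. A minimal sketch of the manual-commit pattern with confluent-kafka, using placeholder topic and broker names:

    from confluent_kafka import Consumer, TopicPartition

    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",  # placeholder
        "group.id": "bizon",
        "enable.auto.commit": False,
    })
    consumer.subscribe(["my-topic"])

    messages = consumer.consume(100, timeout=10)  # analogous to batch_size / consumer_timeout
    # ... process the batch, then commit the next offset to read for each message's partition
    offsets = [TopicPartition("my-topic", m.partition(), m.offset() + 1) for m in messages if not m.error()]
    if offsets:
        consumer.commit(offsets=offsets, asynchronous=False)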
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bizon
- Version: 0.0.13
+ Version: 0.0.14
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com
@@ -79,7 +79,7 @@ bizon/sources/hubspot/src/hubspot_objects.py,sha256=EmABx9XD8q6g4Uc5mHLv5YYl5KcI
  bizon/sources/hubspot/src/models/hs_object.py,sha256=-Y20H3-nenJyySMlvM4TPttPz4O8qm3ArKP_I8pxsuo,1235
  bizon/sources/hubspot/tests/hubspot_pipeline.py,sha256=e6dCF5_MHMySkeiF6kKrSAuCa_48J22-ZeSCZSjrfUI,216
  bizon/sources/kafka/config/kafka.example.yml,sha256=ZyHBmSWZ_5WQaBr9WzD05PuE6vi3hhYgHh2VZ-IU-Iw,755
- bizon/sources/kafka/src/source.py,sha256=VpFMevigrQl-ab8bCWjBQB-fe_y_zGDjZXk98tFA_j8,13873
+ bizon/sources/kafka/src/source.py,sha256=Wlbnnz0HoT1DEyiYV0mPGOUVakFF3ep-4vRooPM55XA,13427
  bizon/sources/kafka/tests/kafka_pipeline.py,sha256=Pg8fvQUKnJYOiriNkUO9DaAVp2_rXh50kVPUH2MGx44,255
  bizon/sources/periscope/config/periscope_charts.example.yml,sha256=rpFDAWeU5oZ3UOiX0sSAgd1X5lv6t-s3iqiDPnRqutU,477
  bizon/sources/periscope/config/periscope_dashboards.example.yml,sha256=sN2iGGqCQCvrMXcwxNGq_dR7-KZ1KtYdXmNYKXlfEpg,481
@@ -87,8 +87,8 @@ bizon/sources/periscope/src/source.py,sha256=AZM-HDDjdTWj8akeeofQ_-G8YlnNHEKi2mj
  bizon/sources/periscope/tests/periscope_pipeline_charts.py,sha256=mU0JtfhS1KmWsS3iovGhGxK7iPVWiYzjBM_QfRL3ZQI,275
  bizon/sources/periscope/tests/periscope_pipeline_dashboard.py,sha256=vZKN7UfH-lQIWrnfjPqQFjZm28UIw2m9OSg4yS-Wckk,279
  bizon/utils.py,sha256=HXaPiyxpWKoy3XN5vSYOve1ezlFeOYin3aFqTjcabUQ,81
- bizon-0.0.13.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
- bizon-0.0.13.dist-info/METADATA,sha256=uTC-3H3zIWwziFXtik8LDZHm5pr3gR_IkRoc39IYhQo,5650
- bizon-0.0.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- bizon-0.0.13.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
- bizon-0.0.13.dist-info/RECORD,,
+ bizon-0.0.14.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
+ bizon-0.0.14.dist-info/METADATA,sha256=Y2qhRY-6BlUwNt0emuiax_vr3mD-9xUQ0XRe-VGFs8M,5650
+ bizon-0.0.14.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ bizon-0.0.14.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
+ bizon-0.0.14.dist-info/RECORD,,