bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +33 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +128 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
  18. bizon/connectors/destinations/file/src/destination.py +56 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +69 -0
  43. bizon/connectors/sources/kafka/src/decode.py +93 -0
  44. bizon/connectors/sources/kafka/src/source.py +381 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +83 -0
  55. bizon/{destinations → destination}/destination.py +103 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +121 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +113 -24
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +39 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +153 -0
  78. bizon/monitoring/monitor.py +71 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +30 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
  90. bizon-0.1.2.dist-info/RECORD +123 -0
  91. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  92. bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon/destinations/config.py +0 -47
  96. bizon/destinations/file/src/destination.py +0 -27
  97. bizon/sources/kafka/src/source.py +0 -357
  98. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  99. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  100. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  101. bizon-0.1.0.dist-info/RECORD +0 -93
  102. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  103. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  104. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  105. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  106. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  107. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  108. /bizon/{destinations → destination}/models.py +0 -0
  109. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  110. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
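Taken together, the renames above move the per-connector code from the top-level bizon/sources and bizon/destinations packages into bizon/connectors/sources and bizon/connectors/destinations, while the shared destination framework moves to the singular bizon/destination package. A minimal, hedged sketch of what this implies for import paths, inferred only from the file list above and not verified against the published 0.1.2 wheel:

# Hedged sketch: new module paths inferred from the "Files changed" list, not verified against 0.1.2.

# 0.1.0 imports (modules deleted in this release, see the hunks below):
#   from bizon.destinations.config import NormalizationType
#   from bizon.destinations.destination import AbstractDestination
#   from bizon.sources.kafka.src.source import KafkaSource

# Likely 0.1.2 equivalents, given the new layout:
from bizon.destination.config import NormalizationType  # bizon/destination/config.py
from bizon.destination.destination import AbstractDestination  # bizon/destination/destination.py
from bizon.connectors.sources.kafka.src.source import KafkaSource  # class name assumed unchanged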
--- a/bizon/destinations/bigquery_streaming/src/destination.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import os
-import tempfile
-from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple, Type
-
-import polars as pl
-from google.api_core.exceptions import NotFound
-from google.cloud import bigquery, bigquery_storage_v1
-from google.cloud.bigquery import DatasetReference, TimePartitioning
-from google.cloud.bigquery_storage_v1.types import (
-    AppendRowsRequest,
-    ProtoRows,
-    ProtoSchema,
-)
-from google.protobuf.message import Message
-
-from bizon.common.models import SyncMetadata
-from bizon.destinations.config import NormalizationType
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.backend.backend import AbstractBackend
-
-from .config import BigQueryStreamingConfigDetails
-from .proto_utils import get_proto_schema_and_class
-
-
-class BigQueryStreamingDestination(AbstractDestination):
-
-    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
-        super().__init__(sync_metadata, config, backend)
-        self.config: BigQueryStreamingConfigDetails = config
-
-        if config.authentication and config.authentication.service_account_key:
-            with tempfile.NamedTemporaryFile(delete=False) as temp:
-                temp.write(config.authentication.service_account_key.encode())
-                temp_file_path = temp.name
-            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
-
-        self.project_id = config.project_id
-        self.bq_client = bigquery.Client(project=self.project_id)
-        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
-        self.dataset_id = config.dataset_id
-        self.dataset_location = config.dataset_location
-        self.bq_max_rows_per_request = config.bq_max_rows_per_request
-
-    @property
-    def table_id(self) -> str:
-        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
-        return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
-
-    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
-        # we keep raw data in the column source_data
-        if self.config.normalization.type == NormalizationType.NONE:
-            return [
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
-        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
-    def check_connection(self) -> bool:
-        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
-
-        try:
-            self.bq_client.get_dataset(dataset_ref)
-        except NotFound:
-            dataset = bigquery.Dataset(dataset_ref)
-            dataset.location = self.dataset_location
-            dataset = self.bq_client.create_dataset(dataset)
-        return True
-
-    def append_rows_to_stream(
-        self,
-        write_client: bigquery_storage_v1.BigQueryWriteClient,
-        stream_name: str,
-        proto_schema: ProtoSchema,
-        serialized_rows: List[bytes],
-    ):
-        request = AppendRowsRequest(
-            write_stream=stream_name,
-            proto_rows=AppendRowsRequest.ProtoData(
-                rows=ProtoRows(serialized_rows=serialized_rows),
-                writer_schema=proto_schema,
-            ),
-        )
-        response = write_client.append_rows(iter([request]))
-        return response.code().name
-
-    @staticmethod
-    def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
-        """Convert a row to a protobuf serialization"""
-        record = TableRowClass()
-        record._bizon_id = row["bizon_id"]
-        record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
-        record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
-        record._source_record_id = row["source_record_id"]
-        record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
-        record._source_data = row["source_data"]
-        return record.SerializeToString()
-
-    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-        # TODO: for now no clustering keys
-        clustering_keys = []
-
-        # Create table if it doesnt exist
-        schema = self.get_bigquery_schema()
-        table = bigquery.Table(self.table_id, schema=schema)
-        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
-        table.time_partitioning = time_partitioning
-
-        table = self.bq_client.create_table(table, exists_ok=True)
-
-        # Create the stream
-        write_client = self.bq_storage_client
-        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
-        parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
-        stream_name = f"{parent}/_default"
-
-        # Generating the protocol buffer representation of the message descriptor.
-        proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
-
-        serialized_rows = [
-            self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
-            for row in df_destination_records.iter_rows(named=True)
-        ]
-
-        results = []
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
-                for batch_rows in self.batch(serialized_rows)
-            ]
-            for future in futures:
-                results.append(future.result())
-
-        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
-
-    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-        self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
-        return True, ""
-
-    def batch(self, iterable):
-        """
-        Yield successive batches of size `batch_size` from `iterable`.
-        """
-
-        for i in range(0, len(iterable), self.bq_max_rows_per_request):
-            yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
--- a/bizon/destinations/config.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from enum import Enum
-from typing import Optional
-
-from pydantic import BaseModel, ConfigDict, Field
-
-
-class DestinationTypes(str, Enum):
-    BIGQUERY = "bigquery"
-    BIGQUERY_STREAMING = "bigquery_streaming"
-    LOGGER = "logger"
-    FILE = "file"
-
-
-class NormalizationType(str, Enum):
-    TABULAR = "tabular"  # Parse key / value pairs to columns
-    NONE = "none"  # No normalization, raw data is stored
-    DEBEZIUM = "debezium"  # Debezium normalization
-
-
-class NormalizationConfig(BaseModel):
-    type: NormalizationType = Field(description="Normalization type")
-
-
-class AbstractDestinationDetailsConfig(BaseModel):
-    buffer_size: int = Field(
-        default=50,
-        description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
-    )
-    buffer_flush_timeout: int = Field(
-        default=600,
-        description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
-    )
-    normalization: Optional[NormalizationConfig] = Field(
-        description="Normalization configuration, by default no normalization is applied",
-        default=NormalizationConfig(type=NormalizationType.NONE),
-    )
-    authentication: Optional[BaseModel] = Field(
-        description="Authentication configuration for the destination, if needed", default=None
-    )
-
-
-class AbstractDestinationConfig(BaseModel):
-    # Forbid extra keys in the model
-    model_config = ConfigDict(extra="forbid")
-
-    name: DestinationTypes = Field(..., description="Name of the destination")
-    config: AbstractDestinationDetailsConfig = Field(..., description="Configuration for the destination")
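For context, a minimal sketch of how the deleted 0.1.0 destination configuration models above fit together; field names and defaults are taken from the removed bizon/destinations/config.py, and in 0.1.2 this schema is superseded by bizon/destination/config.py (whose contents are not shown in the hunks):

# Sketch against the deleted 0.1.0 module shown above; requires bizon 0.1.0.
from bizon.destinations.config import (
    AbstractDestinationConfig,
    AbstractDestinationDetailsConfig,
    NormalizationConfig,
    NormalizationType,
)

destination_config = AbstractDestinationConfig(
    name="logger",  # coerced to DestinationTypes.LOGGER; bigquery, bigquery_streaming and file are the other options
    config=AbstractDestinationDetailsConfig(
        buffer_size=50,  # Mb; 0 disables buffering and writes directly to the destination
        buffer_flush_timeout=600,  # seconds; 0 deactivates the timeout buffer check
        normalization=NormalizationConfig(type=NormalizationType.NONE),
    ),
)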
--- a/bizon/destinations/file/src/destination.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from typing import Tuple
-
-import polars as pl
-from loguru import logger
-
-from bizon.common.models import SyncMetadata
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.backend.backend import AbstractBackend
-
-from .config import FileDestinationDetailsConfig
-
-
-class FileDestination(AbstractDestination):
-
-    def __init__(self, sync_metadata: SyncMetadata, config: FileDestinationDetailsConfig, backend: AbstractBackend):
-        super().__init__(sync_metadata, config, backend)
-        self.config: FileDestinationDetailsConfig = config
-
-    def check_connection(self) -> bool:
-        return True
-
-    def delete_table(self) -> bool:
-        return True
-
-    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-        df_destination_records.write_ndjson(self.config.filepath)
-        return True, ""
--- a/bizon/sources/kafka/src/source.py
+++ /dev/null
@@ -1,357 +0,0 @@
-import io
-import json
-import logging
-import struct
-import traceback
-from datetime import datetime
-from enum import Enum
-from functools import lru_cache
-from typing import Any, List, Literal, Mapping, Tuple
-
-import fastavro
-from avro.schema import Schema, parse
-from confluent_kafka import Consumer, KafkaException, TopicPartition
-from loguru import logger
-from pydantic import BaseModel, Field
-from pytz import UTC
-from requests.exceptions import HTTPError
-
-from bizon.source.auth.config import AuthConfig, AuthType
-from bizon.source.config import SourceConfig
-from bizon.source.models import SourceIteration, SourceRecord
-from bizon.source.source import AbstractSource
-
-silent_logger = logging.getLogger()
-silent_logger.addHandler(logging.StreamHandler())
-
-
-class ApicurioSchemaNotFound(Exception):
-    pass
-
-
-class SchemaRegistryType(str, Enum):
-    APICURIO = "apicurio"
-
-
-class KafkaAuthConfig(AuthConfig):
-
-    type: Literal[AuthType.BASIC] = AuthType.BASIC  # username and password authentication
-
-    # Schema registry authentication
-    schema_registry_type: SchemaRegistryType = Field(
-        default=SchemaRegistryType.APICURIO, description="Schema registry type"
-    )
-
-    schema_registry_url: str = Field(default="", description="Schema registry URL with the format ")
-    schema_registry_username: str = Field(default="", description="Schema registry username")
-    schema_registry_password: str = Field(default="", description="Schema registry password")
-
-
-def default_kafka_consumer_config():
-    return {
-        "auto.offset.reset": "earliest",
-        "enable.auto.commit": False,  # Turn off auto-commit for manual offset handling
-        "session.timeout.ms": 45000,
-        "security.protocol": "SASL_SSL",
-    }
-
-
-class KafkaSourceConfig(SourceConfig):
-
-    # Mandatory Kafka configuration
-    topic: str = Field(..., description="Kafka topic")
-    bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
-    group_id: str = Field(default="bizon", description="Kafka group id")
-
-    skip_message_empty_value: bool = Field(
-        default=True, description="Skip messages with empty value (tombstone messages)"
-    )
-
-    # Kafka consumer configuration
-    batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
-    consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")
-
-    consumer_config: Mapping[str, Any] = Field(
-        default_factory=default_kafka_consumer_config,
-        description="Kafka consumer configuration, as described in the confluent-kafka-python documentation",
-    )
-
-    # Schema ID header configuration
-    nb_bytes_schema_id: Literal[4, 8] = Field(
-        description="Number of bytes encode SchemaID in Kafka message. Standard is 4.",
-        default=4,
-    )
-
-    authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
-
-
-class OffsetPartition(BaseModel):
-    first: int
-    last: int
-    to_fetch: int = 0
-
-
-class TopicOffsets(BaseModel):
-    name: str
-    partitions: Mapping[int, OffsetPartition]
-
-    def set_partition_offset(self, index: int, offset: int):
-        self.partitions[index].to_fetch = offset
-
-    def get_partition_offset(self, index: int) -> int:
-        return self.partitions[index].to_fetch
-
-    @property
-    def total_offset(self) -> int:
-        return sum([partition.last for partition in self.partitions.values()])
-
-
-class KafkaSource(AbstractSource):
-
-    def __init__(self, config: KafkaSourceConfig):
-        super().__init__(config)
-
-        self.config: KafkaSourceConfig = config
-
-        # Kafka consumer configuration
-        if self.config.authentication.type == AuthType.BASIC:
-            self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
-            self.config.consumer_config["sasl.username"] = self.config.authentication.params.username
-            self.config.consumer_config["sasl.password"] = self.config.authentication.params.password
-
-        # Set the bootstrap servers and group id
-        self.config.consumer_config["group.id"] = self.config.group_id
-        self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers
-
-        # Consumer instance
-        self.consumer = Consumer(self.config.consumer_config, logger=silent_logger)
-
-    @staticmethod
-    def streams() -> List[str]:
-        return ["topic"]
-
-    def get_authenticator(self):
-        # We don't use HTTP authentication for Kafka
-        # We use confluence_kafka library to authenticate
-        pass
-
-    @staticmethod
-    def get_config_class() -> AbstractSource:
-        return KafkaSourceConfig
-
-    def check_connection(self) -> Tuple[bool | Any | None]:
-        """Check the connection to the Kafka source"""
-
-        logger.info(f"Found: {len(self.consumer.list_topics().topics)} topics")
-
-        topics = self.consumer.list_topics().topics
-
-        if self.config.topic not in topics:
-            logger.error(f"Topic {self.config.topic} not found, available topics: {topics.keys()}")
-            return False, f"Topic {self.config.topic} not found"
-
-        logger.info(f"Topic {self.config.topic} has {len(topics[self.config.topic].partitions)} partitions")
-
-        return True, None
-
-    def get_number_of_partitions(self) -> int:
-        """Get the number of partitions for the topic"""
-        return len(self.consumer.list_topics().topics[self.config.topic].partitions)
-
-    def get_offset_partitions(self) -> TopicOffsets:
-        """Get the offsets for each partition of the topic"""
-
-        partitions: Mapping[int, OffsetPartition] = {}
-
-        for i in range(self.get_number_of_partitions()):
-            offsets = self.consumer.get_watermark_offsets(TopicPartition(self.config.topic, i))
-            partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])
-
-        return TopicOffsets(name=self.config.topic, partitions=partitions)
-
-    def get_total_records_count(self) -> int | None:
-        """Get the total number of records in the topic, sum of offsets for each partition"""
-        # Init the consumer
-        return self.get_offset_partitions().total_offset
-
-    def parse_global_id_from_serialized_message(self, header_message: bytes) -> int:
-        """Parse the global id from the serialized message"""
-
-        if self.config.nb_bytes_schema_id == 8:
-            return struct.unpack(">bq", header_message)[1]
-
-        if self.config.nb_bytes_schema_id == 4:
-            return struct.unpack(">I", header_message)[0]
-
-        raise ValueError(f"Number of bytes for schema id {self.config.nb_bytes_schema_id} not supported")
-
-    def get_apicurio_schema(self, global_id: int) -> dict:
-        """Get the schema from the Apicurio schema registry"""
-
-        if self.config.authentication.schema_registry_type == SchemaRegistryType.APICURIO:
-
-            try:
-                response = self.session.get(
-                    f"{self.config.authentication.schema_registry_url}/apis/registry/v2/ids/globalIds/{global_id}",
-                    auth=(
-                        self.config.authentication.schema_registry_username,
-                        self.config.authentication.schema_registry_password,
-                    ),
-                )
-
-            except HTTPError as e:
-                if e.response.status_code == 404:
-                    raise ApicurioSchemaNotFound(f"Schema with global id {global_id} not found")
-
-            return response.json()
-
-        else:
-            raise NotImplementedError(
-                f"Schema registry of type {self.config.authentication.schema_registry_type} not supported"
-            )
-
-    def get_parsed_avro_schema(self, global_id: int) -> Schema:
-        """Parse the schema from the Apicurio schema registry"""
-        schema = self.get_apicurio_schema(global_id)
-        schema["name"] = "Envelope"
-        return parse(json.dumps(schema))
-
-    def decode(self, msg_value, schema):
-        message_bytes = io.BytesIO(msg_value)
-        message_bytes.seek(self.config.nb_bytes_schema_id + 1)
-        event_dict = fastavro.schemaless_reader(message_bytes, schema)
-        return event_dict
-
-    @lru_cache(maxsize=None)
-    def get_message_schema(self, header_message: bytes) -> dict:
-        """Get the global id of the schema for the topic"""
-        global_id = self.parse_global_id_from_serialized_message(header_message)
-        return self.get_parsed_avro_schema(global_id).to_json()
-
-    def get_header_bytes(self, message: bytes) -> bytes:
-        if self.config.nb_bytes_schema_id == 8:
-            return message[:9]
-
-        elif self.config.nb_bytes_schema_id == 4:
-            return message[1:5]
-
-        else:
-            raise ValueError(f"Number of bytes for schema id {self.config.nb_bytes_schema_id} not supported")
-
-    def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
-
-        records = []
-
-        for message in encoded_messages:
-
-            if message.error():
-                logger.error(
-                    (
-                        f"Error while consuming message for partition {message.partition()} and offset {message.offset()}: "
-                        f"{message.error()}"
-                    )
-                )
-                raise KafkaException(message.error())
-
-            # We skip tombstone messages
-            if self.config.skip_message_empty_value and not message.value():
-                logger.debug(
-                    f"Message for partition {message.partition()} and offset {message.offset()} is empty, skipping."
-                )
-                continue
-
-            # Get the schema for the message
-            try:
-                header_message_bytes = self.get_header_bytes(message.value())
-                schema = self.get_message_schema(header_message_bytes)
-
-            except ApicurioSchemaNotFound as e:
-                message_schema_id = self.parse_global_id_from_serialized_message(header_message_bytes)
-                logger.error(
-                    (
-                        f"Message on partition {message.partition()} at offset {message.offset()} has a SchemaID of {message_schema_id} which is not found in Registry."
-                        f"message value: {message.value()}."
-                    )
-                )
-                logger.error(traceback.format_exc())
-                raise e
-
-            except Exception as e:
-                logger.error(traceback.format_exc())
-                raise e
-
-            # Decode the message
-            try:
-
-                data = {
-                    "offset": message.offset(),
-                    "partition": message.partition(),
-                    "timestamp": message.timestamp()[1],
-                    "value": self.decode(message.value(), schema),
-                    "key": message.key().decode("utf-8"),
-                }
-
-                records.append(
-                    SourceRecord(
-                        id=f"partition_{message.partition()}_offset_{message.offset()}",
-                        timestamp=datetime.fromtimestamp(message.timestamp()[1] / 1000, tz=UTC),
-                        data=data,
-                    )
-                )
-
-                # Update the offset for the partition
-                self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
-
-            except Exception as e:
-                logger.error(
-                    (
-                        f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()} "
-                        f"with value: {message.value()} and key: {message.key()}"
-                    )
-                )
-                # Try to parse error message from the message
-                try:
-                    message_raw_text = message.value().decode("utf-8")
-                    logger.error(f"Parsed Kafka value: {message_raw_text}")
-                except UnicodeDecodeError:
-                    logger.error("Message is not a valid UTF-8 string")
-
-                logger.error(traceback.format_exc())
-                raise e
-
-        return records
-
-    def read_topic(self, pagination: dict = None) -> SourceIteration:
-        nb_partitions = self.get_number_of_partitions()
-
-        # Setup offset_pagination
-        self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
-
-        self.consumer.assign(
-            [
-                TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
-                for partition in range(nb_partitions)
-            ]
-        )
-
-        t1 = datetime.now()
-        encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
-        logger.info(f"Kafka consumer read : {len(encoded_messages)} messages in {datetime.now() - t1}")
-
-        records = self.parse_encoded_messages(encoded_messages)
-
-        # Update the offset for the partition
-        if not records:
-            logger.info("No new records found, stopping iteration")
-            return SourceIteration(
-                next_pagination={},
-                records=[],
-            )
-
-        return SourceIteration(
-            next_pagination=self.topic_offsets.model_dump(),
-            records=records,
-        )
-
-    def get(self, pagination: dict = None) -> SourceIteration:
-        return self.read_topic(pagination)
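The removed source's schema handling hinges on how the schema id is framed in each message, controlled by nb_bytes_schema_id. A standalone sketch mirroring get_header_bytes and parse_global_id_from_serialized_message above, using a synthetic payload for illustration only:

import struct

def parse_schema_id(message: bytes, nb_bytes_schema_id: int = 4) -> int:
    # Mirrors the deleted 0.1.0 logic: a 4-byte id follows a 1-byte magic prefix,
    # while an 8-byte id is unpacked together with the leading marker byte.
    if nb_bytes_schema_id == 4:
        return struct.unpack(">I", message[1:5])[0]
    if nb_bytes_schema_id == 8:
        return struct.unpack(">bq", message[:9])[1]
    raise ValueError(f"Number of bytes for schema id {nb_bytes_schema_id} not supported")

# Synthetic Kafka message value: magic byte, big-endian schema id, then the Avro body.
payload = b"\x00" + struct.pack(">I", 1234) + b"<avro body>"
assert parse_schema_id(payload, nb_bytes_schema_id=4) == 1234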
--- a/bizon/sources/kafka/tests/kafka_pipeline.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import os
-
-from bizon.engine.engine import RunnerFactory
-
-if __name__ == "__main__":
-    runner = RunnerFactory.create_from_yaml(
-        filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west3_86c1.yml")
-    )
-    runner.run()
--- a/bizon/sources/periscope/config/periscope_charts.example.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-source:
-  source_name: periscope
-  stream_name: charts
-  max_iterations: 20
-  workspace_name: MY_WORKSPACE_NAME
-  client_site_id: 99999
-  database_id: 999
-  authentication:
-    type: cookies
-    params:
-      cookies:
-        periscope_session: ooooooooo
-        cf_bm: kkkkkkkk
-
-backend:
-  type: sqlite_in_memory
-  database: NOT_USED_WITH_SQLITE
-  schema: NOT_USED_WITH_SQLITE
-  syncCursorInDBEvery: 2
-
-consumer:
-  enabled: true
-  name: bigquery
-
-pipeline:
-  log_level: DEBUG
--- a/bizon/sources/periscope/config/periscope_dashboards.example.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-source:
-  source_name: periscope
-  stream_name: dashboards
-  max_iterations: 20
-  workspace_name: MY_WORKSPACE_NAME
-  client_site_id: 99999
-  database_id: 999
-  authentication:
-    type: cookies
-    params:
-      cookies:
-        periscope_session: ooooooooo
-        cf_bm: kkkkkkkk
-
-backend:
-  type: sqlite_in_memory
-  database: NOT_USED_WITH_SQLITE
-  schema: NOT_USED_WITH_SQLITE
-  syncCursorInDBEvery: 2
-
-consumer:
-  enabled: true
-  name: bigquery
-
-pipeline:
-  log_level: DEBUG