bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +33 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +128 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
- bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
- bizon/connectors/destinations/file/src/destination.py +56 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +69 -0
- bizon/connectors/sources/kafka/src/decode.py +93 -0
- bizon/connectors/sources/kafka/src/source.py +381 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +83 -0
- bizon/{destinations → destination}/destination.py +103 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +121 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +113 -24
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +39 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +153 -0
- bizon/monitoring/monitor.py +71 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +30 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
- bizon-0.1.2.dist-info/RECORD +123 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.1.0.dist-info/RECORD +0 -93
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/connectors/sources/cycle/src/source.py
@@ -0,0 +1,133 @@
+from typing import Any, List, Tuple
+
+from pydantic import Field
+from requests.auth import AuthBase
+
+from bizon.source.auth.builder import AuthBuilder
+from bizon.source.auth.config import AuthType
+from bizon.source.config import SourceConfig
+from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.source import AbstractSource
+
+
+class CycleSourceConfig(SourceConfig):
+    slug: str = Field(..., description="Slug of the Cycle account")
+
+
+class CycleSource(AbstractSource):
+    def __init__(self, config: CycleSourceConfig):
+        super().__init__(config)
+        self.config: CycleSourceConfig = config
+        self.url_graphql = "https://api.product.cycle.app/graphql"
+
+    def get_authenticator(self) -> AuthBase:
+        if self.config.authentication.type.value == AuthType.API_KEY:
+            return AuthBuilder.token(params=self.config.authentication.params)
+
+    @staticmethod
+    def streams() -> List[str]:
+        return ["customers"]
+
+    @staticmethod
+    def get_config_class() -> SourceConfig:
+        return CycleSourceConfig
+
+    def check_connection(self) -> Tuple[bool | Any | None]:
+        return True, None
+
+    def get_total_records_count(self) -> int | None:
+        return None
+
+    def run_graphql_query(self, query: str, variables: dict) -> dict:
+        """Run a graphql query and return the response"""
+
+        payload = {"query": query, "variables": variables}
+
+        response = self.session.post(self.url_graphql, json=payload)
+
+        data = response.json()
+        return data
+
+    def _get_pagination_str(self, pagination: dict) -> str:
+        if not pagination:
+            pagination_str = """
+                size: 100
+            """
+        else:
+            pagination_str = """
+                size: 100
+                where: {
+                    cursor: "PAGINATION_CURSOR"
+                    direction: AFTER
+                }
+            """.replace(
+                "PAGINATION_CURSOR", pagination.get("endCursor")
+            )
+
+        return pagination_str
+
+    def get_customers(self, pagination: dict) -> SourceIteration:
+        """Return all customers for the given slug"""
+
+        pagination_str = self._get_pagination_str(pagination=pagination)
+
+        query = """
+        query Customers($slug: DefaultString!) {
+            getProductBySlug(slug: $slug) {
+                customers(pagination: {
+                    PAGINATION_STRING
+                }) {
+                    edges {
+                        cursor
+                        node {
+                            id
+                            email
+                            name
+                            company {
+                                domain
+                                id
+                                name
+                            }
+                        }
+                    }
+                    pageInfo {
+                        hasPreviousPage
+                        hasNextPage
+                        startCursor
+                        endCursor
+                    }
+                }
+            }
+        }
+        """.replace(
+            "PAGINATION_STRING", pagination_str
+        )
+
+        variables = {"slug": self.config.slug}
+
+        data = self.run_graphql_query(query, variables)
+
+        # Parse edges from response
+        edges = data.get("data", {}).get("getProductBySlug", {}).get("customers", {}).get("edges", [])
+
+        records = []
+        for customer in edges:
+            customer_data = customer.get("node", {})
+            records.append(
+                SourceRecord(
+                    id=customer_data["id"],
+                    data=customer_data,
+                )
+            )
+
+        # Get pagination info from response
+        pagination_info = data.get("data", {}).get("getProductBySlug", {}).get("customers", {}).get("pageInfo", {})
+        next_pagination = pagination_info if pagination_info.get("hasNextPage") else {}
+
+        return SourceIteration(records=records, next_pagination=next_pagination)
+
+    def get(self, pagination: dict = None) -> SourceIteration:
+        if self.config.stream == "customers":
+            return self.get_customers(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for Cycle")
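For context, the new source plugs into the standard runner the same way the other connectors in this diff do. Below is a minimal sketch of running it end-to-end, modeled on the test pipelines elsewhere in this release; the job name, slug, and token are placeholders, and the exact YAML keys are assumed to match the other example configs rather than confirmed by the package:

    from yaml import safe_load

    from bizon.engine.engine import RunnerFactory

    # Hypothetical pipeline config; all values are placeholders.
    config = safe_load(
        """
    name: cycle_customers_to_logger

    source:
      name: cycle
      stream: customers          # the only stream CycleSource.streams() returns
      slug: <WORKSPACE_SLUG>     # CycleSourceConfig.slug
      authentication:
        type: api_key
        params:
          token: <CYCLE_API_TOKEN>

    destination:
      name: logger
      config:
        dummy: dummy
    """
    )

    runner = RunnerFactory.create_from_config_dict(config=config)
    runner.run()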
bizon/connectors/sources/cycle/tests/cycle_customers.py
@@ -3,7 +3,7 @@ import os
 from bizon.cli.utils import parse_from_yaml
 from bizon.engine.engine import RunnerFactory
 
-config = parse_from_yaml(os.path.abspath("bizon/sources/
+config = parse_from_yaml(os.path.abspath("bizon/connectors/sources/cycle/config/cycle.yml"))
 
 runner = RunnerFactory.create_from_config_dict(config=config)
 runner.run()
bizon/connectors/sources/dummy/src/fake_api.py
@@ -1,5 +1,10 @@
+import time
+
+
 # Function emulating an API call to a source endpoint
-def fake_api_call(url: str, cursor: str = None) -> dict:
+def fake_api_call(url: str, cursor: str = None, sleep: int = None) -> dict:
+    if sleep:
+        time.sleep(sleep)
     if url == "https://api.dummy.com/v1/creatures":
         return fake_api_call_creatures(cursor)
 
bizon/connectors/sources/dummy/src/source.py
@@ -1,4 +1,5 @@
-
+import random
+from typing import List, Literal, Tuple, Union
 
 from pydantic import Field
 from requests.auth import AuthBase
@@ -7,7 +8,7 @@ from bizon.source.auth.authenticators.oauth import Oauth2AuthParams
 from bizon.source.auth.authenticators.token import TokenAuthParams
 from bizon.source.auth.builder import AuthBuilder
 from bizon.source.auth.config import AuthConfig, AuthType
-from bizon.source.config import SourceConfig
+from bizon.source.config import SourceConfig, SourceSyncModes
 from bizon.source.models import SourceIteration, SourceRecord
 from bizon.source.source import AbstractSource
 
@@ -23,12 +24,14 @@ class DummyAuthConfig(AuthConfig):
 
 class DummySourceConfig(SourceConfig):
     authentication: DummyAuthConfig
+    sleep: int = Field(0, description="Sleep time in seconds between API calls")
 
 
 class DummySource(AbstractSource):
 
     def __init__(self, config: DummySourceConfig):
         super().__init__(config)
+        self.config = config
 
     @staticmethod
     def streams() -> List[str]:
@@ -40,7 +43,7 @@ class DummySource(AbstractSource):
 
     @property
     def url_entity(self) -> str:
-        return f"https://api.dummy.com/v1/{self.config.
+        return f"https://api.dummy.com/v1/{self.config.stream}"
 
     def get_authenticator(self) -> AuthBase:
 
@@ -71,11 +74,11 @@ class DummySource(AbstractSource):
 
         # If no pagination data is passed, we want to reach first page
         if not pagination:
-            response = fake_api_call(url=self.url_entity)
+            response = fake_api_call(url=self.url_entity, sleep=self.config.sleep)
 
         # If we have pagination data we pass it to the API
         else:
-            response = fake_api_call(url=self.url_entity, cursor=pagination.get("cursor"))
+            response = fake_api_call(url=self.url_entity, cursor=pagination.get("cursor"), sleep=self.config.sleep)
 
         # Now we process the response to:
         # - allow bizon to process the records and write them to destination
@@ -87,6 +90,15 @@ class DummySource(AbstractSource):
 
         next_pagination = {"cursor": next_cursor} if next_cursor else {}
 
+        destination_id = None
+
+        # If we are in streaming mode, we need to get the destination id from the stream name
+        if self.config.sync_mode == SourceSyncModes.STREAM:
+            if next_pagination.get("cursor") == "final-cursor":
+                destination_id = "routed"
+            else:
+                destination_id = self.config.stream
+
         if records:
             return SourceIteration(
                 next_pagination=next_pagination,
@@ -94,6 +106,7 @@ class DummySource(AbstractSource):
                 SourceRecord(
                     id=record["id"],
                     data=record,
+                    destination_id=destination_id,
                 )
                 for record in records
             ],
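The `destination_id` added to each `SourceRecord` above is what lets a streaming run fan records out to different destinations. A standalone sketch of just that routing rule, assuming `SourceSyncModes.STREAM` serializes as "stream" (the "final-cursor" / "routed" pair is the test hook visible in the diff, not a general convention):

    # Isolated sketch of the routing decision made in DummySource.get().
    def pick_destination_id(sync_mode: str, stream: str, next_pagination: dict) -> str | None:
        if sync_mode != "stream":
            return None  # batch runs keep the single configured destination
        if next_pagination.get("cursor") == "final-cursor":
            return "routed"  # last page goes to a differently-named destination
        return stream


    assert pick_destination_id("full_refresh", "creatures", {}) is None
    assert pick_destination_id("stream", "creatures", {"cursor": "abc"}) == "creatures"
    assert pick_destination_id("stream", "creatures", {"cursor": "final-cursor"}) == "routed"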
bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py
@@ -0,0 +1,29 @@
+from yaml import safe_load
+
+from bizon.engine.engine import RunnerFactory
+
+config_yaml = """
+name: test_job
+
+source:
+  name: dummy
+  stream: creatures
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+
+transforms:
+  - label: failure_transform
+    python: |
+      data['cookies'] = data['key_that_does_not_exist'].upper()
+"""
+
+config = safe_load(config_yaml)
+runner = RunnerFactory.create_from_config_dict(config=config)
+runner.run()
bizon/connectors/sources/hubspot/src/hubspot_objects.py
@@ -40,7 +40,7 @@ class HubSpotObjectsSource(HubSpotBaseSource):
     def __init__(self, config: HubSpotSourceConfig):
         super().__init__(config)
         self.config: HubSpotSourceConfig = config
-        self.object = self.config.
+        self.object = self.config.stream
         self.selected_properties = []  # Initialize properties to empty list
 
         # If we are initializing the pipeline, we retrieve the selected properties from HubSpot
bizon/connectors/sources/kafka/config/kafka.example.yml
@@ -1,15 +1,13 @@
 name: demo kafka to bigquery
 
 source:
-
-
+  name: kafka
+  stream: topic
 
   sync_mode: full_refresh
 
   topic: my-topic
 
-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <bootstrap-severs>:9092
@@ -47,4 +45,4 @@ destination:
   # syncCursorInDBEvery: 100
 
 # runner:
-#   log_level: INFO
+#   log_level: INFO
bizon/connectors/sources/kafka/config/kafka_debezium.example.yml
@@ -0,0 +1,110 @@
+name: Kafka debezium messages to bigquery streaming
+
+source:
+  name: kafka
+  stream: topic
+
+  sync_mode: full_refresh
+
+  force_ignore_checkpoint: true
+
+  topic: <TOPIC_NAME>
+
+  batch_size: 1000
+  consumer_timeout: 10
+  bootstrap_servers: <BOOTSTRAP_SERVERS>
+  group_id: <GROUP_ID>
+
+  authentication:
+    type: basic
+
+    schema_registry_url: <SCHEMA_REGISTRY_URL>
+    schema_registry_username: <SCHEMA_REGISTRY_USERNAME>
+    schema_registry_password: <SCHEMA_REGISTRY_PASSWORD>
+
+    params:
+      username: <USERNAME>
+      password: <PASSWORD>
+
+destination:
+  name: bigquery_streaming
+
+  config:
+    buffer_size: 50
+    bq_max_rows_per_request: 10000
+    buffer_flush_timeout: 30
+
+    table_id: <TABLE_ID>
+    dataset_id: <DATASET_ID>
+    dataset_location: US
+    project_id: <PROJECT_ID>
+
+    unnest: true
+
+    time_partitioning:
+      # Mandatory if unnested
+      field: __event_timestamp
+
+    record_schema:
+      - name: account_id
+        type: INTEGER
+        mode: REQUIRED
+
+      - name: team_id
+        type: INTEGER
+        mode: REQUIRED
+
+      - name: user_id
+        type: INTEGER
+        mode: REQUIRED
+
+      - name: __deleted
+        type: BOOLEAN
+        mode: NULLABLE
+
+      - name: __cluster
+        type: STRING
+        mode: NULLABLE
+
+      - name: __kafka_partition
+        type: INTEGER
+        mode: NULLABLE
+
+      - name: __kafka_offset
+        type: INTEGER
+        mode: NULLABLE
+
+      - name: __event_timestamp
+        type: TIMESTAMP
+        mode: NULLABLE
+
+transforms:
+  - label: debezium
+    python: |
+      from datetime import datetime
+
+      cluster = data['value']['source']['name'].replace('_', '-')
+      partition = data['partition']
+      offset = data['offset']
+
+      kafka_timestamp = datetime.utcfromtimestamp(data['value']['source']['ts_ms'] / 1000).strftime('%Y-%m-%d %H:%M:%S.%f')
+
+      deleted = False
+
+      if data['value']['op'] == 'd':
+          data = data['value']['before']
+          deleted = True
+      else:
+          data = data['value']['after']
+
+      data['__deleted'] = deleted
+      data['__cluster'] = cluster
+      data['__kafka_partition'] = partition
+      data['__kafka_offset'] = offset
+      data['__event_timestamp'] = kafka_timestamp
+
+engine:
+  queue:
+    type: python_queue
+    config:
+      max_nb_messages: 1000000
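The `debezium` transform in this config can be dry-run outside the pipeline. A self-contained sketch with a fabricated change event; the payload values are illustrative, and only the field access mirrors the transform above:

    # Sketch: the debezium transform applied to a made-up update event,
    # so the flattening behaviour is visible outside the pipeline.
    from datetime import datetime

    data = {
        "partition": 3,
        "offset": 42,
        "value": {
            "op": "u",
            "before": None,
            "after": {"account_id": 1, "team_id": 2, "user_id": 3},
            "source": {"name": "my_cluster", "ts_ms": 1700000000000},
        },
    }

    cluster = data["value"]["source"]["name"].replace("_", "-")
    partition = data["partition"]
    offset = data["offset"]
    kafka_timestamp = datetime.utcfromtimestamp(
        data["value"]["source"]["ts_ms"] / 1000
    ).strftime("%Y-%m-%d %H:%M:%S.%f")

    deleted = False
    if data["value"]["op"] == "d":
        data = data["value"]["before"]  # deletes keep the last known row image
        deleted = True
    else:
        data = data["value"]["after"]

    data["__deleted"] = deleted
    data["__cluster"] = cluster
    data["__kafka_partition"] = partition
    data["__kafka_offset"] = offset
    data["__event_timestamp"] = kafka_timestamp

    print(data)
    # {'account_id': 1, 'team_id': 2, 'user_id': 3, '__deleted': False,
    #  '__cluster': 'my-cluster', '__kafka_partition': 3, '__kafka_offset': 42,
    #  '__event_timestamp': '2023-11-14 22:13:20.000000'}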
bizon/connectors/sources/kafka/src/callback.py
@@ -0,0 +1,18 @@
+from typing import List
+
+from bizon.source.callback import AbstractSourceCallback
+from bizon.source.models import SourceIteration
+
+from .config import KafkaSourceConfig
+
+
+class KafkaSourceCallback(AbstractSourceCallback):
+    def __init__(self, config: KafkaSourceConfig):
+        super().__init__(config)
+
+    def on_iterations_written(self, iterations: List[SourceIteration]):
+        """Commit the offsets of the iterations"""
+
+        # TODO: Implement the callback
+
+        pass
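`on_iterations_written` ships as a stub in this release. One plausible shape for the eventual offset commit, assuming the callback is handed a `confluent_kafka.Consumer` and that each iteration's pagination carries topic/partition/offset; neither assumption is established by the code above, so this is a sketch only:

    # Hypothetical implementation sketch -- the shipped class just passes.
    from confluent_kafka import Consumer, TopicPartition


    def commit_iteration_offsets(consumer: Consumer, iterations) -> None:
        offsets = []
        for iteration in iterations:
            p = iteration.next_pagination  # assumed shape: {"topic", "partition", "offset"}
            offsets.append(
                # Commit offset + 1 so consumption resumes after the written message
                TopicPartition(p["topic"], p["partition"], p["offset"] + 1)
            )
        if offsets:
            consumer.commit(offsets=offsets, asynchronous=False)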
bizon/connectors/sources/kafka/src/config.py
@@ -0,0 +1,69 @@
+from enum import Enum
+from typing import Any, List, Literal, Mapping
+
+from pydantic import BaseModel, Field
+
+from bizon.source.auth.config import AuthConfig, AuthType
+from bizon.source.config import SourceConfig
+
+
+class SchemaRegistryType(str, Enum):
+    APICURIO = "apicurio"
+
+
+class MessageEncoding(str, Enum):
+    UTF_8 = "utf-8"
+    AVRO = "avro"
+
+
+class KafkaAuthConfig(AuthConfig):
+
+    type: Literal[AuthType.BASIC] = AuthType.BASIC  # username and password authentication
+
+    # Schema registry authentication
+    schema_registry_type: SchemaRegistryType = Field(
+        default=SchemaRegistryType.APICURIO, description="Schema registry type"
+    )
+
+    schema_registry_url: str = Field(default="", description="Schema registry URL with the format ")
+    schema_registry_username: str = Field(default="", description="Schema registry username")
+    schema_registry_password: str = Field(default="", description="Schema registry password")
+
+
+def default_kafka_consumer_config():
+    return {
+        "auto.offset.reset": "earliest",
+        "enable.auto.commit": False,  # Turn off auto-commit for manual offset handling
+        "session.timeout.ms": 45000,
+        "security.protocol": "SASL_SSL",
+    }
+
+
+class TopicConfig(BaseModel):
+    name: str = Field(..., description="Kafka topic name")
+    destination_id: str = Field(..., description="Destination id")
+
+
+class KafkaSourceConfig(SourceConfig):
+
+    # Mandatory Kafka configuration
+    topics: List[TopicConfig] = Field(..., description="Kafka topic, comma separated")
+    bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
+    group_id: str = Field(default="bizon", description="Kafka group id")
+
+    skip_message_empty_value: bool = Field(
+        default=True, description="Skip messages with empty value (tombstone messages)"
+    )
+
+    # Kafka consumer configuration
+    batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
+    consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")
+
+    consumer_config: Mapping[str, Any] = Field(
+        default_factory=default_kafka_consumer_config,
+        description="Kafka consumer configuration, as described in the confluent-kafka-python documentation",
+    )
+
+    message_encoding: str = Field(default=MessageEncoding.AVRO, description="Encoding to use to decode the message")
+
+    authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
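To see the defaults and the per-topic routing in one place, a sketch instantiating the config directly; it assumes `SourceConfig` requires only `name` and `stream` beyond the fields shown here, and every value is a placeholder:

    # Hypothetical construction of KafkaSourceConfig; placeholder values throughout.
    config = KafkaSourceConfig(
        name="kafka",
        stream="topic",
        topics=[
            TopicConfig(name="orders", destination_id="orders_table"),
            TopicConfig(name="users", destination_id="users_table"),
        ],
        bootstrap_servers="<BOOTSTRAP_SERVERS>",
        authentication=KafkaAuthConfig(
            params={"username": "<USERNAME>", "password": "<PASSWORD>"},
            schema_registry_url="<SCHEMA_REGISTRY_URL>",
        ),
    )

    # Defaults declared on the model kick in:
    assert config.group_id == "bizon"
    assert config.batch_size == 100
    assert config.consumer_config["auto.offset.reset"] == "earliest"
    assert config.message_encoding == "avro"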