bizon 0.1.0.tar.gz → 0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {bizon-0.1.0 → bizon-0.1.1}/PKG-INFO +22 -6
  2. {bizon-0.1.0 → bizon-0.1.1}/README.md +15 -4
  3. bizon-0.1.1/bizon/alerting/alerts.py +23 -0
  4. bizon-0.1.1/bizon/alerting/models.py +28 -0
  5. bizon-0.1.1/bizon/alerting/slack/__init__.py +0 -0
  6. bizon-0.1.1/bizon/alerting/slack/config.py +5 -0
  7. bizon-0.1.1/bizon/alerting/slack/handler.py +39 -0
  8. bizon-0.1.1/bizon/cli/__init__.py +0 -0
  9. {bizon-0.1.0 → bizon-0.1.1}/bizon/cli/main.py +7 -3
  10. {bizon-0.1.0 → bizon-0.1.1}/bizon/common/models.py +31 -7
  11. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/bigquery/config/bigquery.example.yml +3 -4
  12. bizon-0.1.1/bizon/connectors/destinations/bigquery/src/config.py +127 -0
  13. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/bigquery/src/destination.py +46 -25
  14. bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
  15. bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
  16. bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
  17. bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
  18. {bizon-0.1.0/bizon/destinations/bigquery_streaming → bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
  19. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/file/src/config.py +8 -3
  20. bizon-0.1.1/bizon/connectors/destinations/file/src/destination.py +54 -0
  21. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/logger/src/config.py +1 -1
  22. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/logger/src/destination.py +15 -3
  23. bizon-0.1.1/bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  24. bizon-0.1.1/bizon/connectors/sources/cycle/src/source.py +133 -0
  25. bizon-0.1.0/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py → bizon-0.1.1/bizon/connectors/sources/cycle/tests/cycle_customers.py +1 -1
  26. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/config/dummy.example.yml +2 -2
  27. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/src/fake_api.py +6 -1
  28. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/src/source.py +18 -5
  29. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline.py +2 -2
  30. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  31. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline_kafka.py +2 -2
  32. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  33. bizon-0.1.1/bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  34. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  35. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  36. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/gsheets/config/default_auth.example.yml +4 -2
  37. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/gsheets/config/service_account.example.yml +4 -2
  38. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/config/api_key.example.yml +4 -2
  39. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/config/oauth.example.yml +4 -2
  40. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/src/hubspot_objects.py +1 -1
  41. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/kafka/config/kafka.example.yml +2 -2
  42. bizon-0.1.1/bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
  43. bizon-0.1.1/bizon/connectors/sources/kafka/src/callback.py +18 -0
  44. bizon-0.1.1/bizon/connectors/sources/kafka/src/config.py +75 -0
  45. bizon-0.1.1/bizon/connectors/sources/kafka/src/decode.py +88 -0
  46. bizon-0.1.1/bizon/connectors/sources/kafka/src/source.py +361 -0
  47. bizon-0.1.1/bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  48. bizon-0.1.1/bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  49. bizon-0.1.1/bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  50. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/periscope/src/source.py +137 -13
  51. bizon-0.1.1/bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  52. bizon-0.1.1/bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  53. bizon-0.1.1/bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  54. bizon-0.1.1/bizon/connectors/sources/pokeapi/src/source.py +79 -0
  55. {bizon-0.1.0/bizon/destinations → bizon-0.1.1/bizon/destination}/buffer.py +5 -0
  56. bizon-0.1.1/bizon/destination/config.py +74 -0
  57. {bizon-0.1.0/bizon/destinations → bizon-0.1.1/bizon/destination}/destination.py +71 -15
  58. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  59. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/engine.py +20 -1
  60. bizon-0.1.1/bizon/engine/pipeline/consumer.py +83 -0
  61. bizon-0.1.1/bizon/engine/pipeline/models.py +15 -0
  62. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/pipeline/producer.py +18 -9
  63. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  64. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/kafka/queue.py +3 -2
  65. bizon-0.1.1/bizon/engine/queue/adapters/python_queue/consumer.py +53 -0
  66. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  67. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  68. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  69. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/config.py +16 -0
  70. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/queue.py +17 -16
  71. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/runner/adapters/process.py +15 -2
  72. bizon-0.1.1/bizon/engine/runner/adapters/streaming.py +103 -0
  73. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/runner/adapters/thread.py +32 -9
  74. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/runner/config.py +28 -0
  75. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/runner/runner.py +107 -25
  76. bizon-0.1.1/bizon/monitoring/__init__.py +0 -0
  77. bizon-0.1.1/bizon/monitoring/config.py +29 -0
  78. bizon-0.1.1/bizon/monitoring/datadog/__init__.py +0 -0
  79. bizon-0.1.1/bizon/monitoring/datadog/monitor.py +69 -0
  80. bizon-0.1.1/bizon/monitoring/monitor.py +42 -0
  81. bizon-0.1.1/bizon/monitoring/noop/__init__.py +0 -0
  82. bizon-0.1.1/bizon/monitoring/noop/monitor.py +11 -0
  83. bizon-0.1.1/bizon/source/callback.py +24 -0
  84. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/config.py +3 -3
  85. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/cursor.py +1 -1
  86. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/discover.py +4 -3
  87. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/models.py +4 -2
  88. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/source.py +10 -2
  89. bizon-0.1.1/bizon/transform/config.py +8 -0
  90. bizon-0.1.1/bizon/transform/transform.py +48 -0
  91. {bizon-0.1.0 → bizon-0.1.1}/pyproject.toml +9 -1
  92. bizon-0.1.0/bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon-0.1.0/bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon-0.1.0/bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon-0.1.0/bizon/destinations/config.py +0 -47
  96. bizon-0.1.0/bizon/destinations/file/src/destination.py +0 -27
  97. bizon-0.1.0/bizon/engine/pipeline/consumer.py +0 -15
  98. bizon-0.1.0/bizon/engine/pipeline/models.py +0 -10
  99. bizon-0.1.0/bizon/engine/queue/adapters/python_queue/consumer.py +0 -36
  100. bizon-0.1.0/bizon/sources/kafka/src/source.py +0 -357
  101. bizon-0.1.0/bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  102. bizon-0.1.0/bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  103. bizon-0.1.0/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  104. {bizon-0.1.0 → bizon-0.1.1}/LICENSE +0 -0
  105. {bizon-0.1.0 → bizon-0.1.1}/bizon/__main__.py +0 -0
  106. {bizon-0.1.0/bizon/cli → bizon-0.1.1/bizon/alerting}/__init__.py +0 -0
  107. {bizon-0.1.0 → bizon-0.1.1}/bizon/cli/utils.py +0 -0
  108. {bizon-0.1.0 → bizon-0.1.1}/bizon/common/errors/backoff.py +0 -0
  109. {bizon-0.1.0 → bizon-0.1.1}/bizon/common/errors/errors.py +0 -0
  110. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/gsheets/src/source.py +0 -0
  111. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  112. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/src/hubspot_base.py +0 -0
  113. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/src/models/hs_object.py +0 -0
  114. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/hubspot/tests/hubspot_pipeline.py +0 -0
  115. {bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  116. {bizon-0.1.0/bizon/destinations → bizon-0.1.1/bizon/destination}/models.py +0 -0
  117. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  118. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/backend/backend.py +0 -0
  119. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/backend/config.py +0 -0
  120. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/backend/models.py +0 -0
  121. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/config.py +0 -0
  122. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  123. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  124. {bizon-0.1.0 → bizon-0.1.1}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  125. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  126. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  127. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/basic.py +0 -0
  128. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/cookies.py +0 -0
  129. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/oauth.py +0 -0
  130. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/authenticators/token.py +0 -0
  131. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/builder.py +0 -0
  132. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/auth/config.py +0 -0
  133. {bizon-0.1.0 → bizon-0.1.1}/bizon/source/session.py +0 -0
  134. {bizon-0.1.0 → bizon-0.1.1}/bizon/utils.py +0 -0

{bizon-0.1.0 → bizon-0.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.3
  Name: bizon
- Version: 0.1.0
+ Version: 0.1.1
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Provides-Extra: bigquery
+ Provides-Extra: datadog
  Provides-Extra: gsheets
  Provides-Extra: kafka
  Provides-Extra: postgres
@@ -19,6 +20,7 @@ Requires-Dist: avro (>=1.12.0,<2.0.0) ; extra == "kafka"
  Requires-Dist: backoff (>=2.2.1,<3.0.0)
  Requires-Dist: click (>=8.1.7,<9.0.0)
  Requires-Dist: confluent-kafka (>=2.6.0,<3.0.0) ; extra == "kafka"
+ Requires-Dist: datadog (>=0.50.2,<0.51.0) ; extra == "datadog"
  Requires-Dist: dpath (>=2.2.0,<3.0.0)
  Requires-Dist: fastavro (>=1.9.7,<2.0.0) ; extra == "kafka"
  Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0) ; extra == "bigquery"
@@ -27,6 +29,7 @@ Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
  Requires-Dist: gspread (>=6.1.2,<7.0.0) ; extra == "gsheets"
  Requires-Dist: kafka-python (>=2.0.2,<3.0.0) ; extra == "kafka"
  Requires-Dist: loguru (>=0.7.2,<0.8.0)
+ Requires-Dist: orjson (>=3.10.16,<4.0.0)
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
  Requires-Dist: polars (>=1.16.0,<2.0.0)
@@ -39,8 +42,10 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
  Requires-Dist: pytz (>=2024.2,<2025.0)
  Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
  Requires-Dist: requests (>=2.28.2,<3.0.0)
+ Requires-Dist: simplejson (>=3.20.1,<4.0.0)
  Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
  Requires-Dist: sqlalchemy-bigquery (>=1.11.0,<2.0.0) ; extra == "bigquery"
+ Requires-Dist: tenacity (>=9.0.0,<10.0.0)
  Description-Content-Type: text/markdown

  # bizon ⚡️
@@ -48,13 +53,17 @@ Extract and load your largest data streams with a framework you can trust for bi

  ## Features
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
+
  - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
+
  - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
- - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
+
+ - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement Datadog & OpenTelemetry for tracing. You can monitor:
    - ETAs for completion
    - Number of records processed
    - Completion percentage
    - Latency Source <> Destination
+
  - **Lightweight & lean**: Bizon is lightweight, minimal codebase and only uses few dependencies:
    - `requests` for HTTP requests
    - `pyyaml` for configuration
@@ -83,8 +92,8 @@ Create a file named `config.yml` in your working directory with the following co
  name: demo-creatures-pipeline

  source:
-   source_name: dummy
-   stream_name: creatures
+   name: dummy
+   stream: creatures
    authentication:
      type: api_key
      params:
@@ -105,7 +114,7 @@ bizon run config.yml

  Backend is the interface used by Bizon to store its state. It can be configured in the `backend` section of the configuration file. The following backends are supported:
  - `sqlite`: In-memory SQLite database, useful for testing and development.
- - `biguquery`: Google BigQuery backend, perfect for light setup & production.
+ - `bigquery`: Google BigQuery backend, perfect for light setup & production.
  - `postgres`: PostgreSQL backend, for production use and frequent cursor updates.

  ## Queue configuration
@@ -115,6 +124,13 @@ Queue is the interface used by Bizon to exchange data between `Source` and `Dest
  - `rabbitmq`: RabbitMQ, for production use and high throughput.
  - `kafka`: Apache Kafka, for production use and high throughput and strong persistence.

+ ## Runner configuration
+
+ Runner is the interface used by Bizon to run the pipeline. It can be configured in the `runner` section of the configuration file. The following runners are supported:
+ - `thread` (asynchronous)
+ - `process` (asynchronous)
+ - `stream` (synchronous)
+
  ## Start syncing your data 🚀

  ### Quick setup without any dependencies ✌️

{bizon-0.1.0 → bizon-0.1.1}/README.md
@@ -3,13 +3,17 @@ Extract and load your largest data streams with a framework you can trust for bi

  ## Features
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
+
  - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
+
  - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
- - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
+
+ - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement Datadog & OpenTelemetry for tracing. You can monitor:
    - ETAs for completion
    - Number of records processed
    - Completion percentage
    - Latency Source <> Destination
+
  - **Lightweight & lean**: Bizon is lightweight, minimal codebase and only uses few dependencies:
    - `requests` for HTTP requests
    - `pyyaml` for configuration
@@ -38,8 +42,8 @@ Create a file named `config.yml` in your working directory with the following co
  name: demo-creatures-pipeline

  source:
-   source_name: dummy
-   stream_name: creatures
+   name: dummy
+   stream: creatures
    authentication:
      type: api_key
      params:
@@ -60,7 +64,7 @@ bizon run config.yml

  Backend is the interface used by Bizon to store its state. It can be configured in the `backend` section of the configuration file. The following backends are supported:
  - `sqlite`: In-memory SQLite database, useful for testing and development.
- - `biguquery`: Google BigQuery backend, perfect for light setup & production.
+ - `bigquery`: Google BigQuery backend, perfect for light setup & production.
  - `postgres`: PostgreSQL backend, for production use and frequent cursor updates.

  ## Queue configuration
@@ -70,6 +74,13 @@ Queue is the interface used by Bizon to exchange data between `Source` and `Dest
  - `rabbitmq`: RabbitMQ, for production use and high throughput.
  - `kafka`: Apache Kafka, for production use and high throughput and strong persistence.

+ ## Runner configuration
+
+ Runner is the interface used by Bizon to run the pipeline. It can be configured in the `runner` section of the configuration file. The following runners are supported:
+ - `thread` (asynchronous)
+ - `process` (asynchronous)
+ - `stream` (synchronous)
+
  ## Start syncing your data 🚀

  ### Quick setup without any dependencies ✌️

bizon-0.1.1/bizon/alerting/alerts.py
@@ -0,0 +1,23 @@
+ from abc import ABC, abstractmethod
+ from typing import Dict, List
+
+ from loguru import logger
+
+ from bizon.alerting.models import AlertingConfig, AlertMethod, LogLevel
+
+
+ class AbstractAlert(ABC):
+
+     def __init__(self, type: AlertMethod, config: AlertingConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+         self.type = type
+         self.config = config
+         self.log_levels = log_levels
+
+     @abstractmethod
+     def handler(self, message: Dict) -> None:
+         pass
+
+     def add_handlers(self) -> None:
+         levels = [level.value for level in self.log_levels]
+         for level in levels:
+             logger.add(self.handler, level=level, format="{message}")
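
The base class above wires any subclass's `handler` into loguru as a sink, one registration per configured level. A minimal illustrative subclass, not part of the package, assuming only the alerting modules shown in this diff:

```python
from typing import Dict

from bizon.alerting.alerts import AbstractAlert
from bizon.alerting.models import AlertMethod, LogLevel


class PrintAlert(AbstractAlert):
    """Hypothetical alert channel for illustration: prints instead of notifying."""

    def handler(self, message: Dict) -> None:
        # loguru passes the formatted log line; metadata is available on message.record
        print(f"[ALERT] {message}")


# AlertMethod currently only defines SLACK, so this sketch reuses it
alert = PrintAlert(type=AlertMethod.SLACK, config=None, log_levels=[LogLevel.ERROR])
alert.add_handlers()  # registers PrintAlert.handler as a loguru sink at ERROR level
```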

bizon-0.1.1/bizon/alerting/models.py
@@ -0,0 +1,28 @@
+ from enum import Enum
+ from typing import List, Optional, Union
+
+ from pydantic import BaseModel
+
+ from bizon.alerting.slack.config import SlackConfig
+
+
+ class LogLevel(str, Enum):
+     DEBUG = "DEBUG"
+     INFO = "INFO"
+     WARNING = "WARNING"
+     ERROR = "ERROR"
+     CRITICAL = "CRITICAL"
+
+
+ class AlertMethod(str, Enum):
+     """Alerting methods"""
+
+     SLACK = "slack"
+
+
+ class AlertingConfig(BaseModel):
+     """Alerting configuration model"""
+
+     type: AlertMethod
+     log_levels: Optional[List[LogLevel]] = [LogLevel.ERROR]
+     config: Union[SlackConfig]

bizon-0.1.1/bizon/alerting/slack/__init__.py: file without changes

bizon-0.1.1/bizon/alerting/slack/config.py
@@ -0,0 +1,5 @@
+ from pydantic import BaseModel
+
+
+ class SlackConfig(BaseModel):
+     webhook_url: str

bizon-0.1.1/bizon/alerting/slack/handler.py
@@ -0,0 +1,39 @@
+ import os
+ from typing import Dict, List
+
+ import requests
+ from loguru import logger
+
+ from bizon.alerting.alerts import AbstractAlert, AlertMethod
+ from bizon.alerting.models import LogLevel
+ from bizon.alerting.slack.config import SlackConfig
+
+
+ class SlackHandler(AbstractAlert):
+     def __init__(self, config: SlackConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+         super().__init__(type=AlertMethod.SLACK, config=config, log_levels=log_levels)
+         self.webhook_url = config.webhook_url
+
+     def handler(self, message: Dict) -> None:
+         """
+         Custom handler to send error logs to Slack, with additional context.
+         """
+         log_entry = message.record
+         error_message = (
+             f"*Sync*: `{os.environ.get('BIZON_SYNC_NAME', 'N/A')}`\n"
+             f"*Source*: `{os.environ.get('BIZON_SOURCE_NAME', 'N/A')}` - `{os.environ.get('BIZON_SOURCE_STREAM', 'N/A')}`\n"  # noqa
+             f"*Destination*: `{os.environ.get('BIZON_DESTINATION_NAME', 'N/A')}`\n\n"
+             f"*Message:*\n```{log_entry['message']}```\n"
+             f"*File:* `{log_entry['file'].path}:{log_entry['line']}`\n"
+             f"*Function:* `{log_entry['function']}`\n"
+             f"*Level:* `{log_entry['level'].name}`\n"
+         )
+
+         payload = {"text": f":rotating_light: *Bizon Pipeline Alert* :rotating_light:\n\n{error_message}"}
+
+         try:
+             response = requests.post(self.webhook_url, json=payload)
+             response.raise_for_status()
+         except requests.exceptions.RequestException as e:
+             logger.error(f"Failed to send log to Slack: {e}")
+             return None
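
For comparison, the shipped handler can be wired by hand using only the classes added above; the webhook URL below is a placeholder:

```python
from loguru import logger

from bizon.alerting.models import LogLevel
from bizon.alerting.slack.config import SlackConfig
from bizon.alerting.slack.handler import SlackHandler

slack = SlackHandler(
    config=SlackConfig(webhook_url="https://hooks.slack.com/services/T000/B000/XXXX"),  # placeholder URL
    log_levels=[LogLevel.ERROR, LogLevel.CRITICAL],
)
slack.add_handlers()

logger.error("Source connection lost")  # now also POSTed to the Slack webhook
```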

bizon-0.1.1/bizon/cli/__init__.py: file without changes

{bizon-0.1.0 → bizon-0.1.1}/bizon/cli/main.py
@@ -83,7 +83,7 @@ def destination():
  @click.option(
      "--runner",
      required=False,
-     type=click.Choice(["thread", "process"]),
+     type=click.Choice(["thread", "process", "stream"]),
      default="thread",
      show_default=True,
      help="Runner type to use. Thread or Process.",
@@ -117,9 +117,13 @@ def run(
      set_runner_in_config(config=config, runner=runner)

      runner = RunnerFactory.create_from_config_dict(config=config)
-     runner.run()
+     result = runner.run()

-     click.echo("Pipeline finished.")
+     if result.is_success:
+         click.secho("Pipeline finished successfully.", fg="green")
+
+     else:
+         raise click.exceptions.ClickException(result.to_string())


  if __name__ == "__main__":
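
The same result-driven exit path is usable outside the CLI; a sketch, assuming `RunnerFactory` is exposed by `bizon.engine.engine` (that module changes in this release but its body is not shown here):

```python
import yaml

from bizon.engine.engine import RunnerFactory  # assumed import location

with open("config.yml") as f:
    config = yaml.safe_load(f)

runner = RunnerFactory.create_from_config_dict(config=config)
result = runner.run()

if not result.is_success:
    raise SystemExit(result.to_string())  # mirrors the ClickException path above
```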

{bizon-0.1.0 → bizon-0.1.1}/bizon/common/models.py
@@ -1,13 +1,21 @@
- from typing import Union
+ from typing import Optional, Union

  from pydantic import BaseModel, ConfigDict, Field

- from bizon.destinations.bigquery.src.config import BigQueryConfig
- from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
- from bizon.destinations.file.src.config import FileDestinationConfig
- from bizon.destinations.logger.src.config import LoggerConfig
+ from bizon.alerting.models import AlertingConfig
+ from bizon.connectors.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.connectors.destinations.bigquery_streaming.src.config import (
+     BigQueryStreamingConfig,
+ )
+ from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
+     BigQueryStreamingV2Config,
+ )
+ from bizon.connectors.destinations.file.src.config import FileDestinationConfig
+ from bizon.connectors.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig
+ from bizon.monitoring.config import MonitoringConfig
  from bizon.source.config import SourceConfig, SourceSyncModes
+ from bizon.transform.config import TransformModel


  class BizonConfig(BaseModel):
@@ -23,9 +31,15 @@ class BizonConfig(BaseModel):
          default=...,
      )

+     transforms: Optional[list[TransformModel]] = Field(
+         description="List of transformations to apply to the source data",
+         default=[],
+     )
+
      destination: Union[
          BigQueryConfig,
          BigQueryStreamingConfig,
+         BigQueryStreamingV2Config,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(
@@ -39,6 +53,16 @@ class BizonConfig(BaseModel):
          default=EngineConfig(),
      )

+     alerting: Optional[AlertingConfig] = Field(
+         description="Alerting configuration",
+         default=None,
+     )
+
+     monitoring: Optional[MonitoringConfig] = Field(
+         description="Monitoring configuration",
+         default=None,
+     )
+

  class SyncMetadata(BaseModel):
      """Model which stores general metadata around a sync.
@@ -57,8 +81,8 @@ class SyncMetadata(BaseModel):
          return cls(
              name=config.name,
              job_id=job_id,
-             source_name=config.source.source_name,
-             stream_name=config.source.stream_name,
+             source_name=config.source.name,
+             stream_name=config.source.stream,
              sync_mode=config.source.sync_mode,
              destination_name=config.destination.name,
          )

{bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/bigquery/config/bigquery.example.yml
@@ -1,6 +1,8 @@
+ name: hubspot contacts to bigquery
+
  source:
    name: hubspot
-   stream_name: contacts
+   stream: contacts
    properties:
      strategy: all
    authentication:
@@ -34,6 +36,3 @@ destination:
      "client_x509_cert_url": "",
      "universe_domain": "googleapis.com"
    }
-
- pipeline:
-   log_level: DEBUG

bizon-0.1.1/bizon/connectors/destinations/bigquery/src/config.py
@@ -0,0 +1,127 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ import polars as pl
+ from pydantic import BaseModel, Field
+
+ from bizon.destination.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationColumn,
+     DestinationTypes,
+ )
+
+
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+
+
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class BigQueryColumnType(str, Enum):
+     BOOLEAN = "BOOLEAN"
+     BYTES = "BYTES"
+     DATE = "DATE"
+     DATETIME = "DATETIME"
+     FLOAT = "FLOAT"
+     FLOAT64 = "FLOAT64"
+     GEOGRAPHY = "GEOGRAPHY"
+     INTEGER = "INTEGER"
+     INT64 = "INT64"
+     NUMERIC = "NUMERIC"
+     BIGNUMERIC = "BIGNUMERIC"
+     JSON = "JSON"
+     RECORD = "RECORD"
+     STRING = "STRING"
+     TIME = "TIME"
+     TIMESTAMP = "TIMESTAMP"
+
+
+ class BigQueryColumnMode(str, Enum):
+     NULLABLE = "NULLABLE"
+     REQUIRED = "REQUIRED"
+     REPEATED = "REPEATED"
+
+
+ BIGQUERY_TO_POLARS_TYPE_MAPPING = {
+     "STRING": pl.String,
+     "BYTES": pl.Binary,
+     "INTEGER": pl.Int64,
+     "INT64": pl.Int64,
+     "FLOAT": pl.Float64,
+     "FLOAT64": pl.Float64,
+     "NUMERIC": pl.Float64,  # Can be refined for precision with Decimal128 if needed
+     "BIGNUMERIC": pl.Float64,  # Similar to NUMERIC
+     "BOOLEAN": pl.Boolean,
+     "BOOL": pl.Boolean,
+     "TIMESTAMP": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "DATE": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "DATETIME": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "TIME": pl.Time,
+     "GEOGRAPHY": pl.Object,  # Polars doesn't natively support geography types
+     "ARRAY": pl.List,  # Requires additional handling for element types
+     "JSON": pl.String,
+ }
+
+
+ class BigQueryColumn(DestinationColumn):
+     name: str = Field(..., description="Name of the column")
+     type: BigQueryColumnType = Field(..., description="Type of the column")
+     mode: BigQueryColumnMode = Field(..., description="Mode of the column")
+     description: Optional[str] = Field(None, description="Description of the column")
+     default_value_expression: Optional[str] = Field(None, description="Default value expression")
+
+     @property
+     def polars_type(self):
+         return BIGQUERY_TO_POLARS_TYPE_MAPPING.get(self.type.upper())
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryRecordSchemaConfig(BaseModel):
+     destination_id: str = Field(..., description="Destination ID")
+     record_schema: list[BigQueryColumn] = Field(..., description="Record schema")
+
+     # BigQuery Clustering Keys
+     clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys")
+
+
+ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+
+     # Table details
+     project_id: str = Field(..., description="BigQuery Project ID")
+     dataset_id: str = Field(..., description="BigQuery Dataset ID")
+     dataset_location: str = Field(default="US", description="BigQuery Dataset location")
+
+     # GCS Buffer
+     gcs_buffer_bucket: str = Field(..., description="GCS Buffer bucket")
+     gcs_buffer_format: GCSBufferFormat = Field(default=GCSBufferFormat.PARQUET, description="GCS Buffer format")
+
+     # Time partitioning
+     time_partitioning: TimePartitioning = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+
+     # Schema for unnesting
+     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+         default=None, description="Schema for the records. Required if unnest is set to true."
+     )
+
+     authentication: Optional[BigQueryAuthentication] = None
+
+
+ class BigQueryConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY]
+     buffer_size: Optional[int] = 400
+     config: BigQueryConfigDetails
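
To make the type mapping concrete: TIMESTAMP, DATE and DATETIME deliberately resolve to `pl.String`, so BigQuery performs the datetime parsing at load time. A small sketch using only the classes defined above:

```python
import polars as pl

from bizon.connectors.destinations.bigquery.src.config import (
    BigQueryColumn,
    BigQueryColumnMode,
    BigQueryColumnType,
)

col = BigQueryColumn(
    name="created_at",
    type=BigQueryColumnType.TIMESTAMP,
    mode=BigQueryColumnMode.NULLABLE,
    description="Creation time of the record",
)

# The enum value is looked up in BIGQUERY_TO_POLARS_TYPE_MAPPING
assert col.polars_type is pl.String
```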

{bizon-0.1.0/bizon → bizon-0.1.1/bizon/connectors}/destinations/bigquery/src/destination.py
@@ -1,5 +1,4 @@
  import io
- import json
  import os
  import tempfile
  import traceback
@@ -13,18 +12,24 @@ from google.cloud.bigquery import DatasetReference, TimePartitioning
  from loguru import logger

  from bizon.common.models import SyncMetadata
- from bizon.destinations.config import NormalizationType
- from bizon.destinations.destination import AbstractDestination
+ from bizon.destination.destination import AbstractDestination
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.source.config import SourceSyncModes
+ from bizon.source.source import AbstractSourceCallback

- from .config import BigQueryConfigDetails
+ from .config import BigQueryColumn, BigQueryConfigDetails


  class BigQueryDestination(AbstractDestination):

-     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
-         super().__init__(sync_metadata, config, backend)
+     def __init__(
+         self,
+         sync_metadata: SyncMetadata,
+         config: BigQueryConfigDetails,
+         backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
+     ):
+         super().__init__(sync_metadata, config, backend, source_callback)
          self.config: BigQueryConfigDetails = config

          if config.authentication and config.authentication.service_account_key:
@@ -44,7 +49,7 @@ class BigQueryDestination(AbstractDestination):

      @property
      def table_id(self) -> str:
-         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         tabled_id = self.destination_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
          return f"{self.project_id}.{self.dataset_id}.{tabled_id}"

      @property
@@ -61,28 +66,24 @@ class BigQueryDestination(AbstractDestination):

      def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:

-         # we keep raw data in the column source_data
-         if self.config.normalization.type == NormalizationType.NONE:
+         # Case we unnest the data
+         if self.config.unnest:
              return [
-                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                  bigquery.SchemaField(
-                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                 ),
-                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+                     col.name,
+                     col.type,
+                     mode=col.mode,
+                     description=col.description,
+                 )
+                 for col in self.record_schemas[self.destination_id]
              ]

-         # If normalization is tabular, we parse key / value pairs to columns
-         elif self.config.normalization.type == NormalizationType.TABULAR:
-
-             # We use the first record to infer the schema of tabular data (key / value pairs)
-             source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
-
-             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
+         # Case we don't unnest the data
+         else:
+             return [
                  bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                  bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
                  bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                  bigquery.SchemaField(
                      "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
@@ -90,8 +91,6 @@ class BigQueryDestination(AbstractDestination):
                  bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
              ]

-         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
      def check_connection(self) -> bool:
          dataset_ref = DatasetReference(self.project_id, self.dataset_id)

@@ -129,6 +128,28 @@ class BigQueryDestination(AbstractDestination):

          raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")

+     @staticmethod
+     def unnest_data(df_destination_records: pl.DataFrame, record_schema: list[BigQueryColumn]) -> pl.DataFrame:
+         """Unnest the source_data field into separate columns"""
+
+         # Check if the schema matches the expected schema
+         source_data_fields = (
+             pl.DataFrame(df_destination_records["source_data"].str.json_decode(infer_schema_length=None))
+             .schema["source_data"]
+             .fields
+         )
+
+         record_schema_fields = [col.name for col in record_schema]
+
+         for field in source_data_fields:
+             assert field.name in record_schema_fields, f"Column {field.name} not found in BigQuery schema"
+
+         # Parse the JSON and unnest the fields to polar type
+         return df_destination_records.select(
+             pl.col("source_data").str.json_path_match(f"$.{col.name}").cast(col.polars_type).alias(col.name)
+             for col in record_schema
+         )
+

      def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
          # We always partition by the loaded_at field
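
The new `unnest_data` helper can be exercised in isolation. A sketch with a hypothetical two-column record schema, where each JSON key of `source_data` becomes a typed Polars column:

```python
import polars as pl

from bizon.connectors.destinations.bigquery.src.config import (
    BigQueryColumn,
    BigQueryColumnMode,
    BigQueryColumnType,
)
from bizon.connectors.destinations.bigquery.src.destination import BigQueryDestination

# Hypothetical schema for illustration
record_schema = [
    BigQueryColumn(name="id", type=BigQueryColumnType.INT64, mode=BigQueryColumnMode.NULLABLE),
    BigQueryColumn(name="name", type=BigQueryColumnType.STRING, mode=BigQueryColumnMode.NULLABLE),
]

df = pl.DataFrame({"source_data": ['{"id": 1, "name": "pikachu"}', '{"id": 2, "name": "eevee"}']})

unnested = BigQueryDestination.unnest_data(df, record_schema)
print(unnested)  # shape (2, 2): id cast to Int64, name kept as String
```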

bizon-0.1.1/bizon/connectors/destinations/bigquery_streaming/src/config.py
@@ -0,0 +1,56 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+ from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
+ from bizon.destination.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+
+
+ class TimePartitioningWindow(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class TimePartitioning(BaseModel):
+     type: TimePartitioningWindow = Field(default=TimePartitioningWindow.DAY, description="Time partitioning type")
+     field: Optional[str] = Field(
+         "_bizon_loaded_at", description="Field to partition by. You can use a transformation to create this field."
+     )
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning(type=TimePartitioningWindow.DAY, field="_bizon_loaded_at"),
+         description="BigQuery Time partitioning type",
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+     bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+         default=None, description="Schema for the records. Required if unnest is set to true."
+     )
+     use_legacy_streaming_api: bool = Field(
+         default=False,
+         description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
+     )
+
+
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryStreamingConfigDetails
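
For illustration, the partitioning sub-model above defaults to day partitioning on `_bizon_loaded_at`; overriding only the window keeps the default field:

```python
from bizon.connectors.destinations.bigquery_streaming.src.config import (
    TimePartitioning,
    TimePartitioningWindow,
)

# Hourly partitioning; the partition field falls back to the default
partitioning = TimePartitioning(type=TimePartitioningWindow.HOUR)
assert partitioning.field == "_bizon_loaded_at"
```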