bizon 0.0.4.dev2__tar.gz → 0.0.4.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/PKG-INFO +13 -3
  2. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/README.md +12 -2
  3. bizon-0.0.4.dev4/bizon/__main__.py +4 -0
  4. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/main.py +17 -10
  5. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/utils.py +11 -5
  6. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/config/bigquery.example.yml +2 -0
  7. bizon-0.0.4.dev4/bizon/destinations/buffer.py +76 -0
  8. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/config.py +8 -1
  9. bizon-0.0.4.dev4/bizon/destinations/destination.py +266 -0
  10. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/file/src/destination.py +1 -3
  11. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/models.py +0 -9
  12. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/adapters/sqlalchemy/backend.py +29 -35
  13. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/backend.py +15 -24
  14. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/models.py +5 -3
  15. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/config.py +1 -1
  16. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/engine.py +3 -3
  17. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/producer.py +14 -11
  18. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/consumer.py +2 -0
  19. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -2
  20. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/consumer.py +2 -0
  21. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/queue.py +8 -1
  22. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/adapters/process.py +3 -4
  23. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/adapters/thread.py +24 -6
  24. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/config.py +11 -2
  25. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/runner.py +16 -9
  26. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/discover.py +92 -81
  27. bizon-0.0.4.dev4/bizon/sources/hubspot/src/hubspot_base.py +89 -0
  28. bizon-0.0.4.dev2/bizon/sources/hubspot/src/source_objects.py → bizon-0.0.4.dev4/bizon/sources/hubspot/src/hubspot_objects.py +4 -81
  29. bizon-0.0.4.dev4/bizon/sources/hubspot/tests/hubspot_pipeline.py +7 -0
  30. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/src/source.py +8 -6
  31. bizon-0.0.4.dev4/bizon/utils.py +3 -0
  32. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/pyproject.toml +1 -1
  33. bizon-0.0.4.dev2/bizon/destinations/buffer.py +0 -49
  34. bizon-0.0.4.dev2/bizon/destinations/destination.py +0 -208
  35. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_benchmark.py +0 -26
  36. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_client.py +0 -25
  37. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_iteration.py +0 -30
  38. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -6
  39. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/LICENSE +0 -0
  40. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/__init__.py +0 -0
  41. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/errors/backoff.py +0 -0
  42. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/errors/errors.py +0 -0
  43. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/models.py +0 -0
  44. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/src/config.py +0 -0
  45. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/src/destination.py +0 -0
  46. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/tests/test_bigquery_client.py +0 -0
  47. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/file/src/config.py +0 -0
  48. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/logger/src/config.py +0 -0
  49. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/logger/src/destination.py +0 -0
  50. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  51. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/config.py +0 -0
  52. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/consumer.py +0 -0
  53. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/models.py +0 -0
  54. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  55. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
  56. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  57. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
  58. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  59. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
  60. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/config.py +0 -0
  61. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/models.py +0 -0
  62. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  63. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  64. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/basic.py +0 -0
  65. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/cookies.py +0 -0
  66. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/oauth.py +0 -0
  67. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/token.py +0 -0
  68. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/builder.py +0 -0
  69. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/config.py +0 -0
  70. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/config.py +0 -0
  71. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/cursor.py +0 -0
  72. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/models.py +0 -0
  73. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/session.py +0 -0
  74. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/source.py +0 -0
  75. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  76. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  77. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/src/fake_api.py +0 -0
  78. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/src/source.py +0 -0
  79. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  80. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  81. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  82. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  83. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  84. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  85. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  86. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  87. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/src/source.py +0 -0
  88. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  89. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  90. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  91. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  92. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  93. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  94. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  95. {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bizon
3
- Version: 0.0.4.dev2
3
+ Version: 0.0.4.dev4
4
4
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
5
5
  Author: Antoine Balliet
6
6
  Author-email: antoine.balliet@gmail.com
@@ -45,7 +45,8 @@ Extract and load your largest data streams with a framework you can trust for bi
45
45
 
46
46
  ## Features
47
47
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
48
- - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system like Python Queue, Kafka or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
48
+ - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
49
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
49
50
  - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
50
51
  - ETAs for completion
51
52
  - Number of records processed
@@ -64,6 +65,14 @@ pip install bizon
64
65
 
65
66
  ## Usage
66
67
 
68
+ ### List available sources and streams
69
+ ```bash
70
+ bizon source list
71
+ bizon stream list <source_name>
72
+ ```
73
+
74
+ ### Create a pipeline
75
+
67
76
  Create a file named `config.yml` in your working directory with the following content:
68
77
 
69
78
  ```yaml
@@ -97,7 +106,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
97
106
 
98
107
  Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
99
108
  - `python_queue`: Python Queue, useful for testing and development.
100
- - `kafka`: Apache Kafka, for production use and high throughput.
109
+ - `rabbitmq`: RabbitMQ, for production use and high throughput.
110
+ - `kafka`: Apache Kafka, for production use and high throughput and strong persistence.
101
111
 
102
112
  ## Start syncing your data 🚀
103
113
 
@@ -3,7 +3,8 @@ Extract and load your largest data streams with a framework you can trust for bi
3
3
 
4
4
  ## Features
5
5
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
6
- - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system like Python Queue, Kafka or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
6
+ - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
7
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
7
8
  - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
8
9
  - ETAs for completion
9
10
  - Number of records processed
@@ -22,6 +23,14 @@ pip install bizon
22
23
 
23
24
  ## Usage
24
25
 
26
+ ### List available sources and streams
27
+ ```bash
28
+ bizon source list
29
+ bizon stream list <source_name>
30
+ ```
31
+
32
+ ### Create a pipeline
33
+
25
34
  Create a file named `config.yml` in your working directory with the following content:
26
35
 
27
36
  ```yaml
@@ -55,7 +64,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
55
64
 
56
65
  Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
57
66
  - `python_queue`: Python Queue, useful for testing and development.
58
- - `kafka`: Apache Kafka, for production use and high throughput.
67
+ - `rabbitmq`: RabbitMQ, for production use and high throughput.
68
+ - `kafka`: Apache Kafka, for production use and high throughput and strong persistence.
59
69
 
60
70
  ## Start syncing your data 🚀
61
71
 
@@ -0,0 +1,4 @@
1
"""Package entry point: allows running the Bizon CLI via ``python -m bizon``."""

from bizon.cli.main import cli

if __name__ == "__main__":
    cli()
@@ -1,12 +1,13 @@
1
1
  import click
2
2
 
3
3
  from bizon.engine.engine import RunnerFactory
4
+ from bizon.engine.runner.config import LoggerLevel
4
5
  from bizon.source.discover import discover_all_sources
5
6
 
6
7
  from .utils import (
7
8
  parse_from_yaml,
8
9
  set_custom_source_path_in_config,
9
- set_debug_mode,
10
+ set_log_level,
10
11
  set_runner_in_config,
11
12
  )
12
13
 
@@ -50,7 +51,7 @@ def stream():
50
51
 
51
52
  @stream.command()
52
53
  @click.argument("source_name", type=click.STRING)
53
- def list(source_name: str):
54
+ def list(source_name: str): # noqa
54
55
  """List available streams for a source."""
55
56
  sources = discover_all_sources()
56
57
  source_model = sources.get(source_name)
@@ -88,25 +89,29 @@ def destination():
88
89
  help="Runner type to use. Thread or Process.",
89
90
  )
90
91
  @click.option(
91
- "--debug",
92
+ "--log-level",
92
93
  required=False,
93
- is_flag=True,
94
+ type=click.Choice([level.name for level in LoggerLevel]),
94
95
  show_default=True,
95
- default=False,
96
- help="Enable debug mode.",
96
+ help="Log level to use.",
97
97
  )
98
- def run(filename: str, custom_source: str, runner: str, debug, help="Run a bizon pipeline from a YAML file."):
98
+ def run(
99
+ filename: str,
100
+ custom_source: str,
101
+ runner: str,
102
+ log_level: LoggerLevel,
103
+ help="Run a bizon pipeline from a YAML file.",
104
+ ):
99
105
  """Run a bizon pipeline from a YAML file."""
100
- ctx = click.get_current_context()
101
106
 
102
107
  # Parse config from YAML file as a dictionary
103
108
  config = parse_from_yaml(filename)
104
109
 
105
110
  # Set debug mode
106
- set_debug_mode(debug)
111
+ set_log_level(config=config, level=log_level)
107
112
 
108
113
  # Override source_file_path param in config
109
- set_custom_source_path_in_config(config=config, custom_source=ctx.get_parameter_source("custom-source"))
114
+ set_custom_source_path_in_config(config=config, custom_source=custom_source)
110
115
 
111
116
  # Override runner param in config
112
117
  set_runner_in_config(config=config, runner=runner)
@@ -114,6 +119,8 @@ def run(filename: str, custom_source: str, runner: str, debug, help="Run a bizon
114
119
  runner = RunnerFactory.create_from_config_dict(config=config)
115
120
  runner.run()
116
121
 
122
+ click.echo("Pipeline finished.")
123
+
117
124
 
118
125
  if __name__ == "__main__":
119
126
  cli()
@@ -1,5 +1,3 @@
1
- import os
2
-
3
1
  import yaml
4
2
 
5
3
 
@@ -9,10 +7,17 @@ def parse_from_yaml(path_to_yaml) -> dict:
9
7
  return config
10
8
 
11
9
 
12
- def set_debug_mode(debug: bool):
10
+ # TODO: Refacto
11
+ def set_log_level(config: dict, level: str):
13
12
  # Set Log Level to DEBUG
14
- if debug:
15
- os.environ["LOGURU_LEVEL"] = "DEBUG"
13
+ if level:
14
+ if "engine" not in config:
15
+ config["engine"] = {}
16
+
17
+ if "runner" not in config["engine"]:
18
+ config["engine"]["runner"] = {"log_level": level}
19
+
20
+ config["engine"]["runner"]["log_level"] = level
16
21
 
17
22
 
18
23
  def set_custom_source_path_in_config(config: dict, custom_source: str):
@@ -20,6 +25,7 @@ def set_custom_source_path_in_config(config: dict, custom_source: str):
20
25
  config["source"]["source_file_path"] = custom_source
21
26
 
22
27
 
28
+ # TODO: Refacto
23
29
  def set_runner_in_config(config: dict, runner: str):
24
30
  if runner:
25
31
  if "engine" not in config:
@@ -13,6 +13,8 @@ destination:
13
13
  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
14
14
  name: bigquery
15
15
  config:
16
+ buffer_size: 10 # in Mb
17
+ buffer_flush_timeout: 300 # in seconds
16
18
  dataset_id: bizon_test
17
19
  dataset_location: US
18
20
  project_id: my-gcp-project-id
@@ -0,0 +1,76 @@
1
+ import sys
2
+ from datetime import datetime
3
+ from typing import List
4
+
5
+ from bizon.destinations.models import DestinationRecord
6
+
7
+
8
class DestinationBuffer:
    """In-memory buffer accumulating destination records across source iterations.

    Records pile up until either the size threshold (``buffer_size``, given in
    Mb and stored in bytes) is exceeded or the buffer has been "ripening" for
    longer than ``buffer_flush_timeout`` seconds; the caller (the destination)
    decides when to flush based on these properties.
    """

    def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
        # buffer_size is provided in Mb; keep it internally in bytes.
        self.buffer_size = buffer_size * 1024 * 1024
        # A timeout of 0 disables the time-based flush (see is_ripe / ripeness).
        self.buffer_flush_timeout = buffer_flush_timeout
        self.records: List["DestinationRecord"] = []
        # Source iteration ids currently held in the buffer.
        self._iterations: List[int] = []
        # Latest source pagination checkpoint, kept for recovery purposes.
        self.pagination = {}
        self.modified_at: List[datetime] = [datetime.utcnow()]

    @property
    def current_size(self) -> int:
        """Return current buffer size in bytes.

        NOTE(review): sys.getsizeof on a list is shallow (list header plus the
        pointer array); it does not include the records' own payloads — the
        real memory footprint is larger. Confirm this approximation is intended.
        """
        return sys.getsizeof(self.records)

    @property
    def buffer_free_space_pct(self) -> float:
        """Return free space in buffer as a percentage of buffer_size."""
        return round((self.buffer_free_space / self.buffer_size) * 100, 3)

    @property
    def from_iteration(self) -> int:
        """Return the smallest source iteration held in the buffer.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self._iterations:
            raise ValueError("Buffer is empty")
        return min(self._iterations)

    @property
    def to_iteration(self) -> int:
        """Return the largest source iteration held in the buffer (inclusive).

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self._iterations:
            raise ValueError("Buffer is empty")
        return max(self._iterations)

    @property
    def buffer_free_space(self) -> int:
        """Return remaining capacity for records in the buffer, in bytes."""
        assert self.current_size <= self.buffer_size, "Buffer size exceeded"
        return self.buffer_size - self.current_size

    @property
    def ripeness(self) -> float:
        """Return how long (in seconds) the buffer has been accumulating records."""
        if self.buffer_flush_timeout == 0:
            # Time-based flushing is disabled.
            return 0
        if not self.modified_at:
            # Freshly flushed buffer: nothing has accumulated yet.
            return 0
        # total_seconds() (not .seconds) so spans longer than a day are not truncated.
        return round((max(self.modified_at) - min(self.modified_at)).total_seconds(), 2)

    @property
    def is_ripe(self) -> bool:
        """Check if buffer is ripe for flushing based on the timeout."""
        if self.buffer_flush_timeout == 0:
            return False
        if not self.modified_at:
            # An empty (just-flushed) buffer can never be ripe; previously this
            # raised ValueError because max()/min() were called on an empty list.
            return False
        return (max(self.modified_at) - min(self.modified_at)).total_seconds() >= self.buffer_flush_timeout

    def flush(self):
        """Empty the buffer: drop records, iterations, pagination and timestamps."""
        self.records = []
        self._iterations = []
        self.pagination = {}
        self.modified_at = []

    def add_source_iteration_records_to_buffer(
        self, iteration: int, records: List["DestinationRecord"], pagination: dict = None
    ):
        """Add records for the given source iteration to the buffer.

        The pagination checkpoint is overwritten with the latest value so a
        recovery can resume after the newest buffered iteration.
        """
        self.records.extend(records)
        self._iterations.append(iteration)
        self.pagination = pagination
        self.modified_at.append(datetime.utcnow())
@@ -20,7 +20,14 @@ class NormalizationConfig(BaseModel):
20
20
 
21
21
 
22
22
  class AbstractDestinationDetailsConfig(BaseModel):
23
- buffer_size: int = Field(default=2000, description="Buffer size for the destination")
23
+ buffer_size: int = Field(
24
+ default=50,
25
+ description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
26
+ )
27
+ buffer_flush_timeout: int = Field(
28
+ default=600,
29
+ description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",
30
+ )
24
31
  normalization: Optional[NormalizationConfig] = Field(
25
32
  description="Normalization configuration, by default no normalization is applied",
26
33
  default=NormalizationConfig(type=NormalizationType.NONE),
@@ -0,0 +1,266 @@
1
+ import sys
2
+ from abc import ABC, abstractmethod
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ from loguru import logger
8
+ from pydantic import BaseModel, Field
9
+
10
+ from bizon.common.models import SyncMetadata
11
+ from bizon.engine.backend.backend import AbstractBackend
12
+ from bizon.engine.backend.models import JobStatus
13
+ from bizon.source.config import SourceSyncModes
14
+ from bizon.source.models import SourceRecord
15
+
16
+ from .buffer import DestinationBuffer
17
+ from .config import (
18
+ AbstractDestinationConfig,
19
+ AbstractDestinationDetailsConfig,
20
+ DestinationTypes,
21
+ )
22
+ from .models import DestinationRecord
23
+
24
+
25
class DestinationBufferStatus(str, Enum):
    """Destination buffer status, returned by AbstractDestination.write_or_buffer_records."""

    RECORDS_WRITTEN = "RECORDS_WRITTEN"  # buffer was flushed to the destination
    RECORDS_WRITTEN_THEN_BUFFERED = "RECORDS_WRITTEN_THEN_BUFFERED"  # buffer flushed, then new records buffered
    RECORDS_BUFFERED = "RECORDS_BUFFERED"  # records only added to the buffer
    NO_RECORDS = "NO_RECORDS"  # nothing to write or buffer
32
+
33
+
34
class DestinationIteration(BaseModel):
    """Outcome of one destination write: the flushed source-iteration range,
    the number of records written, and the pagination checkpoint for recovery."""

    success: bool = Field(..., description="Success status of the iteration")
    error_message: Optional[str] = Field(None, description="Error message if iteration failed")
    records_written: int = Field(0, description="Number of records written to the destination")
    from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
    to_source_iteration: Optional[int] = Field(
        None, description="To source iteration identifier buffer ends, inclusive"
    )
    # Typo fix: "interation" -> "iteration" in the description below.
    pagination: Optional[dict] = Field(None, description="Source pagination for next iteration recovery purposes")
43
+
44
+
45
class AbstractDestination(ABC):
    """Base class for all destinations.

    Buffers incoming source records (via DestinationBuffer) and writes them to
    the concrete destination, persisting a destination cursor in the backend
    after every flush so an interrupted sync can resume from the last
    checkpoint.
    """

    def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
        self.sync_metadata = sync_metadata
        self.config = config
        self.backend = backend
        self.buffer = DestinationBuffer(
            buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
        )

    @abstractmethod
    def check_connection(self) -> bool:
        """Return True if the destination is reachable with the current config."""
        pass

    @abstractmethod
    def delete_table(self) -> bool:
        """Delete table in destination."""
        pass

    @abstractmethod
    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, Union[str, None]]:
        """Write records to destination and return (success, error message or None)."""
        pass

    @staticmethod
    def convert_source_records_to_destination_records(
        source_records: List[SourceRecord], extracted_at: datetime
    ) -> List[DestinationRecord]:
        """Convert source records to destination records, stamping extracted_at."""
        return [
            DestinationRecord.from_source_record(source_record=source_record, extracted_at=extracted_at)
            for source_record in source_records
        ]

    def prepare_destination(self):
        """Prepare destination before writing records."""
        # Full refresh starts from a clean slate: drop the existing table.
        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
            self.delete_table()

    def buffer_flush_handler(self, session=None) -> DestinationIteration:
        """Write all buffered records to the destination and persist the outcome.

        Returns a DestinationIteration describing the flushed source-iteration
        range, number of records written and the pagination checkpoint. A
        destination cursor is created in the backend even on failure, so the
        error is recorded.
        """
        # TODO: Add retry strategy

        # Initialize destination iteration
        destination_iteration = DestinationIteration(
            success=False,
            records_written=0,
            pagination=self.buffer.pagination,
        )

        success, error_msg = self.write_records(destination_records=self.buffer.records)

        if success:
            # We wrote records to destination so we keep the count
            destination_iteration.records_written = len(self.buffer.records)
        else:
            # We failed to write records to destination so we keep the error message
            destination_iteration.error_message = error_msg

        destination_iteration.success = success
        destination_iteration.from_source_iteration = self.buffer.from_iteration
        destination_iteration.to_source_iteration = self.buffer.to_iteration
        destination_iteration.pagination = self.buffer.pagination

        # Update destination cursor so the sync can resume from this point
        self.create_cursors(destination_iteration=destination_iteration)

        return destination_iteration

    def write_or_buffer_records(
        self,
        destination_records: List[DestinationRecord],
        iteration: int,
        last_iteration: bool = False,
        session=None,
        pagination: dict = None,
    ) -> DestinationBufferStatus:
        """Write records to destination or buffer them for the given iteration."""

        # Last iteration: flush whatever is buffered and mark the job finished
        if last_iteration:
            logger.debug("Writing last iteration records to destination")
            assert len(destination_records) == 0, "Last iteration should not have any records"
            destination_iteration = self.buffer_flush_handler(session=session)

            if destination_iteration.success:
                # Update job status to success
                self.backend.update_stream_job_status(
                    job_id=self.sync_metadata.job_id,
                    job_status=JobStatus.SUCCEEDED,
                    session=session,
                )

            self.buffer.flush()
            return DestinationBufferStatus.RECORDS_WRITTEN

        # Don't write empty records to destination
        # (the `last_iteration` case returned above, so no extra check needed)
        if len(destination_records) == 0:
            logger.warning("No records to write to destination. Check source and queue provider.")
            return DestinationBufferStatus.NO_RECORDS

        # Buffering disabled: write this iteration straight to the destination.
        # (Log message fixed: it previously claimed "last iteration".)
        if self.buffer.buffer_size == 0:
            logger.info("Buffering disabled (buffer_size=0), writing records directly to destination")
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            self.buffer_flush_handler(session=session)
            self.buffer.flush()
            return DestinationBufferStatus.RECORDS_WRITTEN

        logger.debug(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
        logger.debug(f"Buffer current size {self.buffer.current_size} bytes")
        logger.info(
            f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min."  # noqa
        )

        # Write buffer to destination if buffer is ripe and start a new buffer with this iteration
        if self.buffer.is_ripe:
            logger.info(
                f"Buffer is ripe (buffering for longer than buffer_flush_timeout: {self.buffer.buffer_flush_timeout} seconds), writing buffer to destination"  # noqa
            )
            self.buffer_flush_handler(session=session)
            self.buffer.flush()
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED

        # Buffer can hold all records from this iteration
        elif self.buffer.buffer_free_space >= sys.getsizeof(destination_records):
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            return DestinationBufferStatus.RECORDS_BUFFERED

        # Buffer can contain only some records from this iteration.
        # For now we write the whole buffer to the destination, then buffer the new records.
        else:
            self.buffer_flush_handler(session=session)
            self.buffer.flush()
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED

    def create_cursors(self, destination_iteration: DestinationIteration):
        """Persist a destination cursor in the backend for the flushed iteration range."""
        self.backend.create_destination_cursor(
            job_id=self.sync_metadata.job_id,
            source_name=self.sync_metadata.source_name,
            stream_name=self.sync_metadata.stream_name,
            destination_name=self.sync_metadata.destination_name,
            from_source_iteration=destination_iteration.from_source_iteration,
            to_source_iteration=destination_iteration.to_source_iteration,
            rows_written=destination_iteration.records_written,
            pagination=destination_iteration.pagination,
            success=destination_iteration.success,
        )

    def write_records_and_update_cursor(
        self,
        source_records: List[SourceRecord],
        iteration: int,
        extracted_at: datetime,
        last_iteration: bool = False,
        pagination: dict = None,
    ) -> bool:
        """
        Write records to destination and update the cursor for the given iteration.
        Stores the source pagination for recovery purposes.
        """

        # Case when producer failed to fetch data from first iteration
        if iteration == 0 and len(source_records) == 0:
            logger.warning("Source failed to fetch data from the first iteration, no records will be written.")
            return False

        # Prepare destination only once, at the start of the sync
        if iteration == 0:
            self.prepare_destination()

        destination_records = self.convert_source_records_to_destination_records(
            source_records=source_records,
            extracted_at=extracted_at,
        )

        # Buffer records otherwise write to destination
        self.write_or_buffer_records(
            destination_records=destination_records,
            iteration=iteration,
            last_iteration=last_iteration,
            pagination=pagination,
        )

        return True
241
+
242
+
243
class DestinationFactory:
    """Instantiate the concrete destination matching the configured name.

    Imports are local so only the selected destination's dependencies are loaded.
    """

    @staticmethod
    def get_destination(
        sync_metadata: SyncMetadata,
        config: AbstractDestinationConfig,
        backend: AbstractBackend,
    ) -> AbstractDestination:

        if config.name == DestinationTypes.LOGGER:
            from .logger.src.destination import LoggerDestination

            return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        elif config.name == DestinationTypes.BIGQUERY:
            from .bigquery.src.destination import BigQueryDestination

            return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        elif config.name == DestinationTypes.FILE:
            from .file.src.destination import FileDestination

            return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        # Fixed: adjacent f-strings previously rendered "…{name}with params…"
        # (missing space between the fragments).
        raise ValueError(f"Destination {config.name} with params {config} not found")
@@ -1,4 +1,3 @@
1
- import json
2
1
  from typing import List, Tuple
3
2
 
4
3
  from loguru import logger
@@ -26,6 +25,5 @@ class FileDestination(AbstractDestination):
26
25
  def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
27
26
  with open(self.config.filepath, "a") as f:
28
27
  for record in destination_records:
29
- json.dump(record.model_dump_json(), f)
30
- f.write("\n")
28
+ f.write(record.model_dump_json() + "\n")
31
29
  return True, ""
@@ -1,6 +1,5 @@
1
1
  import json
2
2
  from datetime import datetime
3
- from typing import Optional
4
3
  from uuid import uuid4
5
4
 
6
5
  from pydantic import BaseModel, Field
@@ -9,14 +8,6 @@ from pytz import UTC
9
8
  from bizon.source.models import SourceRecord
10
9
 
11
10
 
12
- class DestinationIteration(BaseModel):
13
- success: bool = Field(..., description="Success status of the iteration")
14
- error_message: Optional[str] = Field(None, description="Error message if iteration failed")
15
- records_written: int = Field(0, description="Number of records written to the destination")
16
- from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
17
- to_source_iteration: Optional[int] = Field(None, description="To source iteration identifier buffer ends")
18
-
19
-
20
11
  class DestinationRecord(BaseModel):
21
12
  bizon_id: str = Field(..., description="Bizon unique identifier of the record")
22
13
  bizon_extracted_at: datetime = Field(..., description="Datetime when the record was extracted")