bizon 0.0.4.dev2__tar.gz → 0.0.4.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/PKG-INFO +13 -3
  2. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/README.md +12 -2
  3. bizon-0.0.4.dev3/bizon/__main__.py +4 -0
  4. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/main.py +3 -1
  5. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/config/bigquery.example.yml +2 -0
  6. bizon-0.0.4.dev3/bizon/destinations/buffer.py +76 -0
  7. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/config.py +8 -1
  8. bizon-0.0.4.dev3/bizon/destinations/destination.py +260 -0
  9. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/file/src/destination.py +1 -3
  10. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/models.py +0 -9
  11. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/adapters/sqlalchemy/backend.py +28 -34
  12. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/backend.py +15 -24
  13. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/models.py +5 -2
  14. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/config.py +1 -1
  15. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/engine.py +3 -3
  16. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/producer.py +14 -12
  17. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/consumer.py +2 -0
  18. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -2
  19. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/consumer.py +2 -0
  20. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/queue.py +8 -1
  21. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/adapters/process.py +3 -4
  22. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/adapters/thread.py +24 -6
  23. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/runner.py +6 -8
  24. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/discover.py +92 -81
  25. bizon-0.0.4.dev3/bizon/sources/hubspot/src/hubspot_base.py +89 -0
  26. bizon-0.0.4.dev2/bizon/sources/hubspot/src/source_objects.py → bizon-0.0.4.dev3/bizon/sources/hubspot/src/hubspot_objects.py +4 -81
  27. bizon-0.0.4.dev3/bizon/sources/hubspot/tests/hubspot_pipeline.py +7 -0
  28. bizon-0.0.4.dev3/bizon/utils.py +3 -0
  29. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/pyproject.toml +1 -1
  30. bizon-0.0.4.dev2/bizon/destinations/buffer.py +0 -49
  31. bizon-0.0.4.dev2/bizon/destinations/destination.py +0 -208
  32. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_benchmark.py +0 -26
  33. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_client.py +0 -25
  34. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_iteration.py +0 -30
  35. bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -6
  36. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/LICENSE +0 -0
  37. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/__init__.py +0 -0
  38. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/utils.py +0 -0
  39. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/errors/backoff.py +0 -0
  40. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/errors/errors.py +0 -0
  41. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/models.py +0 -0
  42. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/src/config.py +0 -0
  43. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/src/destination.py +0 -0
  44. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/tests/test_bigquery_client.py +0 -0
  45. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/file/src/config.py +0 -0
  46. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/logger/src/config.py +0 -0
  47. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/logger/src/destination.py +0 -0
  48. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  49. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/config.py +0 -0
  50. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/consumer.py +0 -0
  51. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/models.py +0 -0
  52. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  53. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
  54. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  55. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
  56. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  57. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
  58. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/config.py +0 -0
  59. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/config.py +0 -0
  60. {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/models.py +0 -0
  61. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  62. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  63. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/basic.py +0 -0
  64. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/cookies.py +0 -0
  65. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/oauth.py +0 -0
  66. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/token.py +0 -0
  67. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/builder.py +0 -0
  68. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/config.py +0 -0
  69. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/config.py +0 -0
  70. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/cursor.py +0 -0
  71. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/models.py +0 -0
  72. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/session.py +0 -0
  73. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/source.py +0 -0
  74. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  75. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  76. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/src/fake_api.py +0 -0
  77. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/src/source.py +0 -0
  78. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  79. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  80. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  81. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  82. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  83. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  84. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  85. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  86. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/src/source.py +0 -0
  87. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  88. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  89. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  90. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  91. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  92. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  93. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/src/source.py +0 -0
  94. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  95. {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: bizon
3
- Version: 0.0.4.dev2
3
+ Version: 0.0.4.dev3
4
4
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
5
5
  Author: Antoine Balliet
6
6
  Author-email: antoine.balliet@gmail.com
@@ -45,7 +45,8 @@ Extract and load your largest data streams with a framework you can trust for bi
45
45
 
46
46
  ## Features
47
47
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
48
- - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system like Python Queue, Kafka or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
48
+ - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
49
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
49
50
  - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
50
51
  - ETAs for completion
51
52
  - Number of records processed
@@ -64,6 +65,14 @@ pip install bizon
64
65
 
65
66
  ## Usage
66
67
 
68
+ ### List available sources and streams
69
+ ```bash
70
+ bizon source list
71
+ bizon stream list <source_name>
72
+ ```
73
+
74
+ ### Create a pipeline
75
+
67
76
  Create a file named `config.yml` in your working directory with the following content:
68
77
 
69
78
  ```yaml
@@ -97,7 +106,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
97
106
 
98
107
  Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
99
108
  - `python_queue`: Python Queue, useful for testing and development.
100
- - `kafka`: Apache Kafka, for production use and high throughput.
109
+ - `rabbitmq`: RabbitMQ, for production use and high throughput.
110
+ - `kafka`: Apache Kafka, for production use, high throughput and strong persistence.
101
111
 
102
112
  ## Start syncing your data 🚀
103
113
 
@@ -3,7 +3,8 @@ Extract and load your largest data streams with a framework you can trust for bi
3
3
 
4
4
  ## Features
5
5
  - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
6
- - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system like Python Queue, Kafka or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
6
+ - **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
7
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
7
8
  - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
8
9
  - ETAs for completion
9
10
  - Number of records processed
@@ -22,6 +23,14 @@ pip install bizon
22
23
 
23
24
  ## Usage
24
25
 
26
+ ### List available sources and streams
27
+ ```bash
28
+ bizon source list
29
+ bizon stream list <source_name>
30
+ ```
31
+
32
+ ### Create a pipeline
33
+
25
34
  Create a file named `config.yml` in your working directory with the following content:
26
35
 
27
36
  ```yaml
@@ -55,7 +64,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
55
64
 
56
65
  Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
57
66
  - `python_queue`: Python Queue, useful for testing and development.
58
- - `kafka`: Apache Kafka, for production use and high throughput.
67
+ - `rabbitmq`: RabbitMQ, for production use and high throughput.
68
+ - `kafka`: Apache Kafka, for production use, high throughput and strong persistence.
59
69
 
60
70
  ## Start syncing your data 🚀
61
71
 
@@ -0,0 +1,4 @@
1
from bizon.cli.main import cli

# Run the CLI group from bizon.cli.main when this module is executed as a script.
if __name__ == "__main__":
    cli()
@@ -50,7 +50,7 @@ def stream():
50
50
 
51
51
  @stream.command()
52
52
  @click.argument("source_name", type=click.STRING)
53
- def list(source_name: str):
53
+ def list(source_name: str): # noqa
54
54
  """List available streams for a source."""
55
55
  sources = discover_all_sources()
56
56
  source_model = sources.get(source_name)
@@ -114,6 +114,8 @@ def run(filename: str, custom_source: str, runner: str, debug, help="Run a bizon
114
114
  runner = RunnerFactory.create_from_config_dict(config=config)
115
115
  runner.run()
116
116
 
117
+ click.echo("Pipeline finished.")
118
+
117
119
 
118
120
  if __name__ == "__main__":
119
121
  cli()
@@ -13,6 +13,8 @@ destination:
13
13
  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
14
14
  name: bigquery
15
15
  config:
16
+ buffer_size: 10 # in Mb
17
+ buffer_flush_timeout: 300 # in seconds
16
18
  dataset_id: bizon_test
17
19
  dataset_location: US
18
20
  project_id: my-gcp-project-id
@@ -0,0 +1,76 @@
1
+ import sys
2
+ from datetime import datetime
3
+ from typing import List
4
+
5
+ from bizon.destinations.models import DestinationRecord
6
+
7
+
8
class DestinationBuffer:
    """In-memory staging area for records waiting to be written to a destination.

    Records are appended one source iteration at a time. The buffer keeps the
    iterations it currently holds, the most recent source pagination and one
    timestamp per modification, so callers can decide to flush on size or age.
    """

    def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
        """
        Args:
            buffer_size: Capacity expressed in Mb; stored internally in bytes.
            buffer_flush_timeout: Age in seconds after which the buffer is
                considered ripe. ``0`` disables the age check.
        """
        self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
        self.buffer_flush_timeout = buffer_flush_timeout
        self.records: List["DestinationRecord"] = []
        self._iterations: List[int] = []
        self.pagination = {}
        self.modified_at: List[datetime] = [datetime.utcnow()]

    @property
    def current_size(self) -> int:
        """Return buffer size in bytes.

        NOTE(review): ``sys.getsizeof`` is shallow, it measures the list object
        only and not the records it references, so this under-estimates the
        real memory footprint.
        """
        return sys.getsizeof(self.records)

    @property
    def buffer_free_space_pct(self) -> float:
        """Return free space in buffer in percentage"""
        # A zero-sized buffer has no capacity at all; avoid dividing by zero.
        if self.buffer_size == 0:
            return 0.0
        return round((self.buffer_free_space / self.buffer_size) * 100, 3)

    @property
    def from_iteration(self) -> int:
        """Return the smallest iteration in buffer

        Raises:
            ValueError: if no iteration has been buffered.
        """
        if not self._iterations:
            raise ValueError("Buffer is empty")
        return min(self._iterations)

    @property
    def to_iteration(self) -> int:
        """Return the largest iteration in buffer

        Raises:
            ValueError: if no iteration has been buffered.
        """
        if not self._iterations:
            raise ValueError("Buffer is empty")
        return max(self._iterations)

    @property
    def buffer_free_space(self) -> int:
        """Return free space for records in buffer, in bytes"""
        assert self.current_size <= self.buffer_size, "Buffer size exceeded"
        return self.buffer_size - self.current_size

    def _age_seconds(self) -> float:
        """Return the time elapsed between the oldest and newest modification.

        Returns 0 when no modification is recorded (right after ``flush``).
        ``total_seconds()`` is used because ``timedelta.seconds`` only holds the
        sub-day component and would wrap around after 24 hours.
        """
        if not self.modified_at:
            return 0.0
        return (max(self.modified_at) - min(self.modified_at)).total_seconds()

    @property
    def ripeness(self) -> float:
        """Return buffer ripeness in seconds (0 when the timeout is disabled)"""
        if self.buffer_flush_timeout == 0:
            return 0
        return round(self._age_seconds(), 2)

    @property
    def is_ripe(self) -> bool:
        """Check if buffer is ripe for flushing based on the timeout"""
        if self.buffer_flush_timeout == 0:
            return False
        return self._age_seconds() >= self.buffer_flush_timeout

    def flush(self):
        """Flush buffer: drop records, iterations, pagination and timestamps"""
        self.records = []
        self._iterations = []
        self.pagination = {}
        self.modified_at = []

    def add_source_iteration_records_to_buffer(
        self, iteration: int, records: List["DestinationRecord"], pagination: dict = None
    ):
        """Add records for the given iteration to buffer

        Args:
            iteration: Source iteration the records come from.
            records: Records to append to the buffer.
            pagination: Source pagination to remember; replaces the previous one.
        """
        self.records.extend(records)
        self._iterations.append(iteration)
        self.pagination = pagination
        self.modified_at.append(datetime.utcnow())
@@ -20,7 +20,14 @@ class NormalizationConfig(BaseModel):
20
20
 
21
21
 
22
22
  class AbstractDestinationDetailsConfig(BaseModel):
23
- buffer_size: int = Field(default=2000, description="Buffer size for the destination")
23
+ buffer_size: int = Field(
24
+ default=50,
25
+ description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
26
+ )
27
+ buffer_flush_timeout: int = Field(
28
+ default=600,
29
+ description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",
30
+ )
24
31
  normalization: Optional[NormalizationConfig] = Field(
25
32
  description="Normalization configuration, by default no normalization is applied",
26
33
  default=NormalizationConfig(type=NormalizationType.NONE),
@@ -0,0 +1,260 @@
1
+ import sys
2
+ from abc import ABC, abstractmethod
3
+ from datetime import datetime
4
+ from enum import Enum
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ from loguru import logger
8
+ from pydantic import BaseModel, Field
9
+
10
+ from bizon.common.models import SyncMetadata
11
+ from bizon.engine.backend.backend import AbstractBackend
12
+ from bizon.engine.backend.models import JobStatus
13
+ from bizon.source.config import SourceSyncModes
14
+ from bizon.source.models import SourceRecord
15
+
16
+ from .buffer import DestinationBuffer
17
+ from .config import (
18
+ AbstractDestinationConfig,
19
+ AbstractDestinationDetailsConfig,
20
+ DestinationTypes,
21
+ )
22
+ from .models import DestinationRecord
23
+
24
+
25
class DestinationBufferStatus(str, Enum):
    """Destination buffer status"""

    # The buffer content was written to the destination.
    RECORDS_WRITTEN = "RECORDS_WRITTEN"
    # The buffer content was written, then the incoming records were buffered.
    RECORDS_WRITTEN_THEN_BUFFERED = "RECORDS_WRITTEN_THEN_BUFFERED"
    # The incoming records were only added to the buffer.
    RECORDS_BUFFERED = "RECORDS_BUFFERED"
    # There were no incoming records to handle.
    NO_RECORDS = "NO_RECORDS"
32
+
33
+
34
class DestinationIteration(BaseModel):
    """Outcome of one attempt to write buffered records to the destination."""

    success: bool = Field(..., description="Success status of the iteration")
    error_message: Optional[str] = Field(None, description="Error message if iteration failed")
    records_written: int = Field(0, description="Number of records written to the destination")
    from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
    to_source_iteration: Optional[int] = Field(
        None, description="To source iteration identifier buffer ends, inclusive"
    )
    pagination: Optional[dict] = Field(None, description="Source pagination for next iteration recovery purposes")
43
+
44
+
45
class AbstractDestination(ABC):
    """Base class for destinations.

    Concrete destinations implement ``check_connection``, ``delete_table`` and
    ``write_records``. This class adds buffering of incoming records and the
    bookkeeping (job status and destination cursors) stored in the backend.
    """

    def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
        self.sync_metadata = sync_metadata
        self.config = config
        self.backend = backend
        self.buffer = DestinationBuffer(
            buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
        )

    @abstractmethod
    def check_connection(self) -> bool:
        """Check the connection to the destination"""
        pass

    @abstractmethod
    def delete_table(self) -> bool:
        """Delete table in destination"""
        pass

    @abstractmethod
    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, Union[str, None]]:
        """Write records to destination for the given iteration and return success status and error message"""
        pass

    @staticmethod
    def convert_source_records_to_destination_records(
        source_records: List[SourceRecord], extracted_at: datetime
    ) -> List[DestinationRecord]:
        """Convert source records to destination records"""
        return [
            DestinationRecord.from_source_record(source_record=source_record, extracted_at=extracted_at)
            for source_record in source_records
        ]

    def prepare_destination(self):
        """Prepare destination before writing records"""
        # Delete table if full refresh
        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
            self.delete_table()

    def buffer_flush_handler(self, session=None) -> DestinationIteration:
        """Write the buffered records to the destination and record the outcome.

        The job status is updated in the backend (SUCCEEDED or FAILED) and, when
        the buffer holds at least one source iteration, a destination cursor is
        created for the range of iterations written. The buffer itself is not
        emptied here; callers call ``self.buffer.flush()`` afterwards.

        Args:
            session: Optional backend session forwarded to the job status update.

        Returns:
            The DestinationIteration describing what was written.
        """
        nb_records_to_write = len(self.buffer.records)

        # Initialize destination iteration
        destination_iteration = DestinationIteration(
            success=False,
            records_written=0,
            pagination=self.buffer.pagination,
        )

        success, error_msg = self.write_records(destination_records=self.buffer.records)

        if success:
            destination_iteration.records_written = nb_records_to_write
            self.backend.update_stream_job_status(
                job_id=self.sync_metadata.job_id,
                job_status=JobStatus.SUCCEEDED,
                session=session,
            )
        else:
            destination_iteration.error_message = error_msg
            self.backend.update_stream_job_status(
                job_id=self.sync_metadata.job_id,
                job_status=JobStatus.FAILED,
                error_message=error_msg,
                session=session,
            )

        destination_iteration.success = success
        destination_iteration.pagination = self.buffer.pagination

        # The buffer raises ValueError when it holds no iteration, which happens when
        # the closing iteration arrives right after a flush (always the case when
        # buffering is disabled). There is then no iteration range to checkpoint.
        try:
            destination_iteration.from_source_iteration = self.buffer.from_iteration
            destination_iteration.to_source_iteration = self.buffer.to_iteration
        except ValueError:
            logger.debug("Buffer holds no source iteration, no destination cursor created")
            return destination_iteration

        # Update destination cursor
        self.create_cursors(destination_iteration=destination_iteration)
        return destination_iteration

    def _write_buffer_and_reset(self, session=None) -> None:
        """Write the current buffer content to the destination, then empty the buffer."""
        self.buffer_flush_handler(session=session)
        self.buffer.flush()

    def write_or_buffer_records(
        self,
        destination_records: List[DestinationRecord],
        iteration: int,
        last_iteration: bool = False,
        session=None,
        pagination: dict = None,
    ) -> DestinationBufferStatus:
        """Write records to destination or buffer them for the given iteration

        Args:
            destination_records: Records of the current source iteration.
            iteration: Source iteration identifier.
            last_iteration: When True, the buffer is written out; no records are expected.
            session: Optional backend session forwarded to the flush handler.
            pagination: Source pagination stored with the buffered records.

        Returns:
            The DestinationBufferStatus describing what happened to the records.
        """

        # Last iteration, write all records to destination
        if last_iteration:
            logger.debug("Writing last iteration records to destination")
            assert len(destination_records) == 0, "Last iteration should not have any records"
            self._write_buffer_and_reset(session=session)
            return DestinationBufferStatus.RECORDS_WRITTEN

        # Don't write empty records to destination
        if len(destination_records) == 0:
            logger.warning("No records to write to destination. Check source and queue provider.")
            return DestinationBufferStatus.NO_RECORDS

        # Write records to destination if buffer size is 0
        if self.buffer.buffer_size == 0:
            logger.debug("Buffering is disabled (buffer_size=0), writing records directly to destination")
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            self._write_buffer_and_reset(session=session)
            return DestinationBufferStatus.RECORDS_WRITTEN

        logger.info(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
        logger.info(f"Buffer current size {self.buffer.current_size} bytes")
        logger.info(
            f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min."  # noqa
        )

        if self.buffer.is_ripe:
            # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
            logger.debug(
                f"Buffer is ripe (buffering for longer than buffer_flush_timeout: {self.buffer.buffer_flush_timeout} seconds), writing buffer to destination"  # noqa
            )
        elif self.buffer.buffer_free_space >= sys.getsizeof(destination_records):
            # Buffer can hold all records from this iteration
            self.buffer.add_source_iteration_records_to_buffer(
                iteration=iteration, records=destination_records, pagination=pagination
            )
            return DestinationBufferStatus.RECORDS_BUFFERED

        # Either the buffer is ripe or it cannot hold the whole iteration.
        # For now we write the whole buffer to destination and then buffer the new records.
        self._write_buffer_and_reset(session=session)
        self.buffer.add_source_iteration_records_to_buffer(
            iteration=iteration, records=destination_records, pagination=pagination
        )
        return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED

    def create_cursors(self, destination_iteration: DestinationIteration):
        """Create a destination cursor in the backend from the given destination iteration"""
        self.backend.create_destination_cursor(
            job_id=self.sync_metadata.job_id,
            source_name=self.sync_metadata.source_name,
            stream_name=self.sync_metadata.stream_name,
            destination_name=self.sync_metadata.destination_name,
            from_source_iteration=destination_iteration.from_source_iteration,
            to_source_iteration=destination_iteration.to_source_iteration,
            rows_written=destination_iteration.records_written,
            pagination=destination_iteration.pagination,
            success=destination_iteration.success,
        )

    def write_records_and_update_cursor(
        self,
        source_records: List[SourceRecord],
        iteration: int,
        extracted_at: datetime,
        last_iteration: bool = False,
        pagination: dict = None,
    ) -> bool:
        """
        Write records to destination and update the cursor for the given iteration.
        Stores the source pagination for recovery purposes.

        Returns:
            False when the first iteration carries no records, True otherwise.
        """

        # Case when producer failed to fetch data from first iteration
        if iteration == 0 and len(source_records) == 0:
            logger.warning("Source failed to fetch data from the first iteration, no records will be written.")
            return False

        # Prepare destination
        if iteration == 0:
            self.prepare_destination()

        destination_records = self.convert_source_records_to_destination_records(
            source_records=source_records,
            extracted_at=extracted_at,
        )

        # Buffer records otherwise write to destination
        self.write_or_buffer_records(
            destination_records=destination_records,
            iteration=iteration,
            last_iteration=last_iteration,
            pagination=pagination,
        )

        return True
235
+
236
+
237
class DestinationFactory:
    """Creates the destination instance matching a destination configuration."""

    @staticmethod
    def get_destination(
        sync_metadata: SyncMetadata,
        config: AbstractDestinationConfig,
        backend: AbstractBackend,
    ) -> AbstractDestination:
        """Return the destination selected by ``config.name``.

        Each destination module is imported inside its own branch.

        Raises:
            ValueError: if ``config.name`` matches none of the handled destination types.
        """

        if config.name == DestinationTypes.LOGGER:
            from .logger.src.destination import LoggerDestination

            return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        elif config.name == DestinationTypes.BIGQUERY:
            from .bigquery.src.destination import BigQueryDestination

            return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        elif config.name == DestinationTypes.FILE:
            from .file.src.destination import FileDestination

            return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

        # The two original f-string fragments were joined without a separating space.
        raise ValueError(f"Destination {config.name} with params {config} not found")
@@ -1,4 +1,3 @@
1
- import json
2
1
  from typing import List, Tuple
3
2
 
4
3
  from loguru import logger
@@ -26,6 +25,5 @@ class FileDestination(AbstractDestination):
26
25
  def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
27
26
  with open(self.config.filepath, "a") as f:
28
27
  for record in destination_records:
29
- json.dump(record.model_dump_json(), f)
30
- f.write("\n")
28
+ f.write(record.model_dump_json() + "\n")
31
29
  return True, ""
@@ -1,6 +1,5 @@
1
1
  import json
2
2
  from datetime import datetime
3
- from typing import Optional
4
3
  from uuid import uuid4
5
4
 
6
5
  from pydantic import BaseModel, Field
@@ -9,14 +8,6 @@ from pytz import UTC
9
8
  from bizon.source.models import SourceRecord
10
9
 
11
10
 
12
- class DestinationIteration(BaseModel):
13
- success: bool = Field(..., description="Success status of the iteration")
14
- error_message: Optional[str] = Field(None, description="Error message if iteration failed")
15
- records_written: int = Field(0, description="Number of records written to the destination")
16
- from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
17
- to_source_iteration: Optional[int] = Field(None, description="To source iteration identifier buffer ends")
18
-
19
-
20
11
  class DestinationRecord(BaseModel):
21
12
  bizon_id: str = Field(..., description="Bizon unique identifier of the record")
22
13
  bizon_extracted_at: datetime = Field(..., description="Datetime when the record was extracted")
@@ -4,7 +4,7 @@ from typing import Generator, Optional, Union
4
4
 
5
5
  from loguru import logger
6
6
  from pytz import UTC
7
- from sqlalchemy import Result, Select, create_engine, inspect, select, update
7
+ from sqlalchemy import Result, Select, create_engine, func, inspect, select, update
8
8
  from sqlalchemy.engine import Engine, create_engine
9
9
  from sqlalchemy.orm import Session, scoped_session, sessionmaker
10
10
 
@@ -27,7 +27,7 @@ from .config import BigQueryConfigDetails, PostgresConfigDetails, SQLiteConfigDe
27
27
 
28
28
  class SQLAlchemyBackend(AbstractBackend):
29
29
 
30
- def __init__(self, config: Union[PostgresConfigDetails, SQLiteConfigDetails], type: BackendTypes):
30
+ def __init__(self, config: Union[PostgresConfigDetails, SQLiteConfigDetails], type: BackendTypes, **kwargs):
31
31
  super().__init__(config, type)
32
32
 
33
33
  self._engine = None
@@ -38,6 +38,8 @@ class SQLAlchemyBackend(AbstractBackend):
38
38
  BigQueryConfigDetails,
39
39
  ] = config
40
40
 
41
+ self.kwargs = kwargs
42
+
41
43
  @property
42
44
  def session(self) -> Generator[Session, None, None]:
43
45
  """yields a SQLAlchemy connection"""
@@ -125,7 +127,8 @@ class SQLAlchemyBackend(AbstractBackend):
125
127
  def get_engine(self) -> Engine:
126
128
  """Return the SQLAlchemy engine"""
127
129
  if self.type == BackendTypes.SQLITE:
128
- return self._get_engine()
130
+ assert "sqlalchemy_engine" in self.kwargs
131
+ return self.kwargs["sqlalchemy_engine"]
129
132
 
130
133
  if not self._engine:
131
134
  self._engine = self._get_engine()
@@ -310,20 +313,16 @@ class SQLAlchemyBackend(AbstractBackend):
310
313
  logger.warning(f"No cursor found for id={cursor_id}")
311
314
  return None
312
315
 
313
- def get_last_source_cursor_by_job_id(
314
- self, job_id: str, session: Optional[Session] = None
315
- ) -> Optional[SourceCursor]:
316
- """Get the last cursor we should start source from for the given job id
317
- The cursor should have status CursorStatus.WRITTEN_IN_DESTINATION
318
- """
316
+ def get_last_cursor_by_job_id(self, job_id: str, session: Optional[Session] = None) -> Optional[SourceCursor]:
317
+ """Get the last cursor we should start source from for the given job id"""
319
318
 
320
319
  smt = (
321
- select(SourceCursor)
320
+ select(DestinationCursor)
322
321
  .filter(
323
- SourceCursor.job_id == job_id,
324
- SourceCursor.status == CursorStatus.WRITTEN_IN_DESTINATION,
322
+ DestinationCursor.job_id == job_id,
323
+ DestinationCursor.success == True, # noqa
325
324
  )
326
- .order_by(SourceCursor.iteration.desc())
325
+ .order_by(DestinationCursor.to_source_iteration.desc())
327
326
  )
328
327
 
329
328
  results = self._execute(smt, session=session).first()
@@ -355,8 +354,6 @@ class SQLAlchemyBackend(AbstractBackend):
355
354
  logger.warning(f"No last cursor found for source={source_name} stream={stream_name} iteration={iteration}")
356
355
  return None
357
356
 
358
- ### DESTINATION CURSOR ###
359
-
360
357
  def create_destination_cursor(
361
358
  self,
362
359
  job_id: str,
@@ -366,6 +363,8 @@ class SQLAlchemyBackend(AbstractBackend):
366
363
  from_source_iteration: int,
367
364
  to_source_iteration: int,
368
365
  rows_written: int,
366
+ success: bool,
367
+ pagination: Optional[dict] = None,
369
368
  session: Session | None = None,
370
369
  ) -> DestinationCursor:
371
370
 
@@ -377,30 +376,13 @@ class SQLAlchemyBackend(AbstractBackend):
377
376
  from_source_iteration=from_source_iteration,
378
377
  to_source_iteration=to_source_iteration,
379
378
  rows_written=rows_written,
379
+ pagination=json.dumps(pagination) if pagination else None,
380
+ success=success,
380
381
  )
381
382
  new_destination_cursor = self._add_and_commit(destination_cursor, session=session)
382
383
  logger.debug(f"New Destination Cursor has been created: {new_destination_cursor}")
383
384
  return new_destination_cursor
384
385
 
385
- def update_destination_cursor(
386
- self,
387
- cursor_id: str,
388
- rows_written: int,
389
- status: CursorStatus,
390
- error_message: Optional[str] = None,
391
- session: Session | None = None,
392
- ):
393
- """Update the status of the stream job with the given id"""
394
- stmt = (
395
- update(DestinationCursor)
396
- .where(DestinationCursor.id == cursor_id)
397
- .values(
398
- status=status, error_message=error_message, rows_written=rows_written, updated_at=datetime.now(tz=UTC)
399
- )
400
- .execution_options(synchronize_session="fetch")
401
- )
402
- self._execute(stmt, session=session)
403
-
404
386
  def get_destination_cursor_by_id(
405
387
  self, cursor_id: str, session: Optional[Session] = None
406
388
  ) -> Optional[DestinationCursor]:
@@ -414,3 +396,15 @@ class SQLAlchemyBackend(AbstractBackend):
414
396
  return results[0]
415
397
  logger.warning(f"No job found for id={cursor_id}")
416
398
  return None
399
+
400
def get_number_of_written_rows_for_job(self, job_id: str, session: Optional[Session] = None) -> Optional[int]:
    """Sum ``rows_written`` over the successful destination cursors of a job.

    Args:
        job_id: Identifier of the job whose destination cursors are summed.
        session: Optional session forwarded to ``self._execute``.

    Returns:
        The first column of the single row produced by the aggregate query.

    Raises:
        Exception: if the query yields no row.
    """
    total_rows_query = select(func.sum(DestinationCursor.rows_written)).filter(
        DestinationCursor.job_id == job_id,
        DestinationCursor.success == True,  # noqa
    )
    row = self._execute(total_rows_query, session=session).one_or_none()
    if not row:
        raise Exception(f"No rows written found for job_id={job_id}")
    return row[0]