bizon 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff shows the changes between two publicly available package versions as released to their public registry, and is provided for informational purposes only.
@@ -2,22 +2,19 @@ import io
  import json
  import os
  import tempfile
+ import traceback
  from typing import List, Tuple
  from uuid import uuid4

- import pandas as pd
- import pyarrow as pa
- import pyarrow.parquet as pq
+ import polars as pl
  from google.api_core.exceptions import NotFound
  from google.cloud import bigquery, storage
  from google.cloud.bigquery import DatasetReference, TimePartitioning
  from loguru import logger
- from pytz import UTC

  from bizon.common.models import SyncMetadata
  from bizon.destinations.config import NormalizationType
  from bizon.destinations.destination import AbstractDestination
- from bizon.destinations.models import DestinationRecord
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.source.config import SourceSyncModes

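Note: 0.0.13 drops the pandas/pyarrow stack (and the pytz and DestinationRecord imports that went with it) in favour of polars, so records travel through the destination as a single DataFrame. A minimal sketch of the equivalent conversion step, assuming records arrive as plain dicts (column names here are illustrative, not from the package):

    import polars as pl

    # Build a DataFrame from raw records and cast every column to string,
    # roughly mirroring the removed pandas `pd.DataFrame(...).astype(str)` path
    records = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    df = pl.from_dicts(records).with_columns(pl.all().cast(pl.Utf8))

    print(df.columns)                   # ['id', 'name']
    print(df.estimated_size(unit="b"))  # in-memory size, as used later by the buffer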
@@ -62,7 +59,7 @@ class BigQueryDestination(AbstractDestination):
  elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
  return f"{self.table_id}"

- def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+ def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:

  # we keep raw data in the column source_data
  if self.config.normalization.type == NormalizationType.NONE:
@@ -77,26 +74,13 @@ class BigQueryDestination(AbstractDestination):
  bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
  ]

- elif self.config.normalization.type == NormalizationType.DEBEZIUM:
- assert (
- "_bizon_message_key" in destination_records[0].source_data
- ), "Debezium records must have a '_bizon_message_key' key"
- message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
- return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
- bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
- bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
- bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField(
- "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
- ),
- bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
- ]
-
  # If normalization is tabular, we parse key / value pairs to columns
  elif self.config.normalization.type == NormalizationType.TABULAR:
- first_record_keys = destination_records[0].source_data.keys()
- return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+
+ # We use the first record to infer the schema of tabular data (key / value pairs)
+ source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
+
+ return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
  bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
  bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
  bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
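For TABULAR normalization the schema is now inferred from the first row's `source_data` JSON string instead of from a `DestinationRecord`. A small illustration of the key extraction, with made-up record contents:

    import json

    from google.cloud import bigquery

    # Hypothetical first value of the `source_data` column
    source_data = '{"id": "42", "name": "Alice"}'
    keys = list(json.loads(source_data).keys())  # ['id', 'name']

    # One NULLABLE STRING column per top-level key, followed by the _source_* / _bizon_* metadata columns
    schema = [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in keys]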
@@ -108,58 +92,6 @@ class BigQueryDestination(AbstractDestination):

  raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")

- def get_batch_records_as_df(self, destination_records: List[DestinationRecord]) -> pd.DataFrame:
-
- # We keep raw data in a column -> convert the SourceRecord to a DestinationRecord
- if self.config.normalization.type == NormalizationType.NONE:
- df = pd.DataFrame([record.to_dict_raw_json_data(parquet=True) for record in destination_records])
- df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
- # If normalization is tabular, we can just convert the data to a DataFrame parsing first-level keys
- elif self.config.normalization.type == NormalizationType.TABULAR:
- list_data_dict = [record.source_data for record in destination_records]
- df = pd.DataFrame(list_data_dict).astype(str)
- df["_bizon_id"] = [uuid4().hex for _ in range(len(destination_records))]
-
- df["_bizon_extracted_at"] = [
- int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
- ]
-
- df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
- df["_source_record_id"] = [record.source_record_id for record in destination_records]
-
- # We need to convert the source datetime to a int timestamp
- df["_source_timestamp"] = [
- int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
- ]
-
- elif self.config.normalization.type == NormalizationType.DEBEZIUM:
- df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
- df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
- else:
- raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
- return df
-
- def convert_and_upload_to_buffer(self, destination_records: List[DestinationRecord]):
-
- df = self.get_batch_records_as_df(destination_records)
-
- # Convert DataFrame to Parquet in-memory
- if self.buffer_format == "parquet":
- table = pa.Table.from_pandas(df)
- buffer = io.BytesIO()
- pq.write_table(table, buffer)
- buffer.seek(0)
-
- # Upload the Parquet file to GCS
- file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
- blob = self.buffer_bucket.blob(file_name)
- blob.upload_from_file(buffer, content_type="application/octet-stream")
- return file_name
-
  def check_connection(self) -> bool:
  dataset_ref = DatasetReference(self.project_id, self.dataset_id)

@@ -179,7 +111,25 @@ class BigQueryDestination(AbstractDestination):
  # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
  # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError

- def load_to_bigquery(self, gcs_file: str, destination_records: List[DestinationRecord]):
+ def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
+
+ if self.buffer_format == "parquet":
+
+ # Upload the Parquet file to GCS
+ file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
+
+ with io.BytesIO() as stream:
+ df_destination_records.write_parquet(stream)
+ stream.seek(0)
+
+ blob = self.buffer_bucket.blob(file_name)
+ blob.upload_from_file(stream, content_type="application/octet-stream")
+
+ return file_name
+
+ raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")
+
+ def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):

  # We always partition by the loaded_at field
  time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
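The rewritten `convert_and_upload_to_buffer` serialises the polars DataFrame to Parquet entirely in memory and then pushes it to the GCS buffer bucket. A standalone sketch of that path; the function name, bucket and prefix are placeholders:

    import io
    from uuid import uuid4

    import polars as pl
    from google.cloud import storage

    def upload_parquet_to_gcs(df: pl.DataFrame, bucket: storage.Bucket, prefix: str) -> str:
        # Serialize the DataFrame to an in-memory Parquet buffer, then upload it as a blob
        file_name = f"{prefix}/{uuid4()}.parquet"
        with io.BytesIO() as stream:
            df.write_parquet(stream)
            stream.seek(0)
            bucket.blob(file_name).upload_from_file(stream, content_type="application/octet-stream")
        return file_name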
@@ -187,34 +137,41 @@ class BigQueryDestination(AbstractDestination):
  job_config = bigquery.LoadJobConfig(
  source_format=bigquery.SourceFormat.PARQUET,
  write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
- schema=self.get_bigquery_schema(destination_records=destination_records),
+ schema=self.get_bigquery_schema(df_destination_records=df_destination_records),
  time_partitioning=time_partitioning,
  )

- if self.config.normalization.type == NormalizationType.DEBEZIUM:
- job_config.clustering_fields = list(
- json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
- )
-
  load_job = self.bq_client.load_table_from_uri(
  f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
  )
+ result = load_job.result() # Waits for the job to complete
+ assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
+
+ def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+
+ # Rename fields to match BigQuery schema
+ df_destination_records = df_destination_records.rename(
+ {
+ # Bizon fields
+ "bizon_extracted_at": "_bizon_extracted_at",
+ "bizon_id": "_bizon_id",
+ "bizon_loaded_at": "_bizon_loaded_at",
+ # Source fields
+ "source_record_id": "_source_record_id",
+ "source_timestamp": "_source_timestamp",
+ "source_data": "_source_data",
+ },
+ )

- load_job.result()
-
- def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-
- # Here we can check if these IDs are already present in BigQuery
- # Using SourceRecord.id values
-
- gs_file_name = self.convert_and_upload_to_buffer(destination_records=destination_records)
+ gs_file_name = self.convert_and_upload_to_buffer(df_destination_records=df_destination_records)

  try:
- self.load_to_bigquery(gs_file_name, destination_records=destination_records)
+ self.load_to_bigquery(gcs_file=gs_file_name, df_destination_records=df_destination_records)
  self.cleanup(gs_file_name)
  except Exception as e:
  self.cleanup(gs_file_name)
  logger.error(f"Error loading data to BigQuery: {e}")
+ logger.error(traceback.format_exc())
  return False, str(e)
  return True, ""

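`write_records` now receives the buffer's polars DataFrame directly and renames its columns to the BigQuery field names before upload. The rename is a plain polars mapping; the data below is illustrative:

    import polars as pl

    df = pl.DataFrame({"bizon_id": ["abc"], "source_data": ['{"id": "1"}']})
    df = df.rename({"bizon_id": "_bizon_id", "source_data": "_source_data"})
    print(df.columns)  # ['_bizon_id', '_source_data']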
@@ -10,11 +10,6 @@ from bizon.destinations.config import (
  )


- class GCSBufferFormat(str, Enum):
- PARQUET = "parquet"
- CSV = "csv"
-
-
  class TimePartitioning(str, Enum):
  DAY = "DAY"
  HOUR = "HOUR"
@@ -29,7 +24,7 @@ class BigQueryAuthentication(BaseModel):
  )


- class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
  project_id: str
  dataset_id: str
  dataset_location: Optional[str] = "US"
@@ -40,16 +35,9 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
  default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
  )
  authentication: Optional[BigQueryAuthentication] = None
-
- buffer_size: int = Field(default=0, description="Buffer size in MB")
-
- @field_validator("buffer_size", mode="after")
- def validate_buffer_size(cls, value: int) -> int:
- if value != 0:
- raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
- return value
+ bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")


  class BigQueryStreamingConfig(AbstractDestinationConfig):
  name: Literal[DestinationTypes.BIGQUERY_STREAMING]
- config: BigQueryConfigDetails
+ config: BigQueryStreamingConfigDetails
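The streaming destination's config drops the unused `GCSBufferFormat` enum and the `buffer_size` validator, and gains `bq_max_rows_per_request` (default 30000), which caps how many serialized rows go into a single AppendRows request. A quick sketch of how that cap partitions a payload; the numbers are illustrative:

    # 100,000 rows with the default cap of 30,000 rows per request -> 4 AppendRows requests
    rows = list(range(100_000))
    bq_max_rows_per_request = 30_000

    batches = [rows[i : i + bq_max_rows_per_request] for i in range(0, len(rows), bq_max_rows_per_request)]
    print([len(b) for b in batches])  # [30000, 30000, 30000, 10000]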
@@ -1,29 +1,33 @@
- import json
  import os
  import tempfile
- from typing import List, Tuple
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import List, Tuple, Type

+ import polars as pl
  from google.api_core.exceptions import NotFound
- from google.cloud import bigquery, bigquery_storage_v1, storage
+ from google.cloud import bigquery, bigquery_storage_v1
  from google.cloud.bigquery import DatasetReference, TimePartitioning
- from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
- from loguru import logger
+ from google.cloud.bigquery_storage_v1.types import (
+ AppendRowsRequest,
+ ProtoRows,
+ ProtoSchema,
+ )
+ from google.protobuf.message import Message

  from bizon.common.models import SyncMetadata
  from bizon.destinations.config import NormalizationType
  from bizon.destinations.destination import AbstractDestination
- from bizon.destinations.models import DestinationRecord
  from bizon.engine.backend.backend import AbstractBackend

- from .config import BigQueryConfigDetails
+ from .config import BigQueryStreamingConfigDetails
  from .proto_utils import get_proto_schema_and_class


  class BigQueryStreamingDestination(AbstractDestination):

- def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+ def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
  super().__init__(sync_metadata, config, backend)
- self.config: BigQueryConfigDetails = config
+ self.config: BigQueryStreamingConfigDetails = config

  if config.authentication and config.authentication.service_account_key:
  with tempfile.NamedTemporaryFile(delete=False) as temp:
@@ -34,16 +38,16 @@ class BigQueryStreamingDestination(AbstractDestination):
  self.project_id = config.project_id
  self.bq_client = bigquery.Client(project=self.project_id)
  self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
- self.gcs_client = storage.Client(project=self.project_id)
  self.dataset_id = config.dataset_id
  self.dataset_location = config.dataset_location
+ self.bq_max_rows_per_request = config.bq_max_rows_per_request

  @property
  def table_id(self) -> str:
  tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
  return f"{self.project_id}.{self.dataset_id}.{tabled_id}"

- def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+ def get_bigquery_schema(self) -> List[bigquery.SchemaField]:

  # we keep raw data in the column source_data
  if self.config.normalization.type == NormalizationType.NONE:
@@ -58,35 +62,6 @@ class BigQueryStreamingDestination(AbstractDestination):
  bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
  ]

- elif self.config.normalization.type == NormalizationType.DEBEZIUM:
- assert (
- "_bizon_message_key" in destination_records[0].source_data
- ), "Debezium records must have a '_bizon_message_key' key"
- message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
- return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
- bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
- bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
- bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField(
- "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
- ),
- bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
- ]
-
- # If normalization is tabular, we parse key / value pairs to columns
- elif self.config.normalization.type == NormalizationType.TABULAR:
- first_record_keys = destination_records[0].source_data.keys()
- return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
- bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
- bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
- bigquery.SchemaField(
- "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
- ),
- bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
- ]
-
  raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")

  def check_connection(self) -> bool:
@@ -100,21 +75,45 @@ class BigQueryStreamingDestination(AbstractDestination):
  dataset = self.bq_client.create_dataset(dataset)
  return True

- def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+ def append_rows_to_stream(
+ self,
+ write_client: bigquery_storage_v1.BigQueryWriteClient,
+ stream_name: str,
+ proto_schema: ProtoSchema,
+ serialized_rows: List[bytes],
+ ):
+ request = AppendRowsRequest(
+ write_stream=stream_name,
+ proto_rows=AppendRowsRequest.ProtoData(
+ rows=ProtoRows(serialized_rows=serialized_rows),
+ writer_schema=proto_schema,
+ ),
+ )
+ response = write_client.append_rows(iter([request]))
+ return response.code().name
+
+ @staticmethod
+ def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+ """Convert a row to a protobuf serialization"""
+ record = TableRowClass()
+ record._bizon_id = row["bizon_id"]
+ record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+ record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+ record._source_record_id = row["source_record_id"]
+ record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
+ record._source_data = row["source_data"]
+ return record.SerializeToString()
+
+ def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+ # TODO: for now no clustering keys
  clustering_keys = []

- if self.config.normalization.type == NormalizationType.DEBEZIUM:
- clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
-
  # Create table if it doesnt exist
- schema = self.get_bigquery_schema(destination_records=destination_records)
+ schema = self.get_bigquery_schema()
  table = bigquery.Table(self.table_id, schema=schema)
  time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
  table.time_partitioning = time_partitioning

- if clustering_keys:
- table.clustering_fields = clustering_keys
-
  table = self.bq_client.create_table(table, exists_ok=True)

  # Create the stream
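Rows are now pulled out of the polars DataFrame with `iter_rows(named=True)`, and datetimes are rendered as "%Y-%m-%d %H:%M:%S.%f" strings before being packed into the generated proto message. A minimal illustration of that per-row conversion; the DataFrame contents and column set are made up, and the real message class comes from `get_proto_schema_and_class`:

    from datetime import datetime, timezone

    import polars as pl

    df = pl.DataFrame(
        {
            "bizon_id": ["abc"],
            "bizon_loaded_at": [datetime.now(tz=timezone.utc)],
            "source_data": ['{"id": "1"}'],
        }
    )

    for row in df.iter_rows(named=True):
        # Same formatting as to_protobuf_serialization applies before SerializeToString()
        print(row["bizon_id"], row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f"), row["source_data"])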
@@ -127,22 +126,29 @@ class BigQueryStreamingDestination(AbstractDestination):
  proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)

  serialized_rows = [
- record.to_protobuf_serialization(
- TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
- )
- for record in destination_records
+ self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+ for row in df_destination_records.iter_rows(named=True)
  ]

- request = AppendRowsRequest(
- write_stream=stream_name,
- proto_rows=AppendRowsRequest.ProtoData(
- rows=ProtoRows(serialized_rows=serialized_rows),
- writer_schema=proto_schema,
- ),
- )
- response = write_client.append_rows(iter([request]))
- assert response.code().name == "OK"
+ results = []
+ with ThreadPoolExecutor() as executor:
+ futures = [
+ executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+ for batch_rows in self.batch(serialized_rows)
+ ]
+ for future in futures:
+ results.append(future.result())
+
+ assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"

- def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
- self.load_to_bigquery_via_streaming(destination_records=destination_records)
+ def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+ self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
  return True, ""
+
+ def batch(self, iterable):
+ """
+ Yield successive batches of size `batch_size` from `iterable`.
+ """
+
+ for i in range(0, len(iterable), self.bq_max_rows_per_request):
+ yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
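Instead of a single AppendRows call, the serialized rows are now chunked with `batch()` and the chunks are appended concurrently from a thread pool, with each future's status checked afterwards. A self-contained sketch of the same fan-out pattern, where `send` is a placeholder for `append_rows_to_stream`:

    from concurrent.futures import ThreadPoolExecutor
    from typing import Iterable, List

    def batch(items: List[bytes], size: int) -> Iterable[List[bytes]]:
        # Yield successive chunks of at most `size` items
        for i in range(0, len(items), size):
            yield items[i : i + size]

    def send(chunk: List[bytes]) -> str:
        # Placeholder for append_rows_to_stream: returns the gRPC status name
        return "OK"

    serialized_rows = [b"row"] * 70_000
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(send, chunk) for chunk in batch(serialized_rows, 30_000)]
        results = [future.result() for future in futures]

    assert all(r == "OK" for r in results), "Failed to append rows to stream"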
@@ -1,8 +1,11 @@
- import sys
  from datetime import datetime
  from typing import List

- from bizon.destinations.models import DestinationRecord
+ from loguru import logger
+ from polars import DataFrame
+ from pytz import UTC
+
+ from .models import destination_record_schema


  class DestinationBuffer:
@@ -10,15 +13,15 @@ class DestinationBuffer:
  def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
  self.buffer_size = buffer_size * 1024 * 1024 # Convert to bytes
  self.buffer_flush_timeout = buffer_flush_timeout
- self.records: List[DestinationRecord] = []
+ self.df_destination_records: DataFrame = DataFrame(schema=destination_record_schema)
  self._iterations: List[int] = []
  self.pagination = {}
- self.modified_at: List[datetime] = [datetime.utcnow()]
+ self.modified_at: List[datetime] = [datetime.now(tz=UTC)]

  @property
  def current_size(self) -> int:
  """Return buffer size"""
- return sys.getsizeof(self.records)
+ return self.df_destination_records.estimated_size(unit="b")

  @property
  def buffer_free_space_pct(self) -> float:
@@ -61,16 +64,20 @@ class DestinationBuffer:

  def flush(self):
  """Flush buffer"""
- self.records = []
+ self.df_destination_records = DataFrame(schema=destination_record_schema)
  self._iterations = []
  self.pagination = {}
  self.modified_at = []

  def add_source_iteration_records_to_buffer(
- self, iteration: int, records: List[DestinationRecord], pagination: dict = None
+ self, iteration: int, df_destination_records: DataFrame, pagination: dict = None
  ):
  """Add records for the given iteration to buffer"""
- self.records.extend(records)
+ self.df_destination_records.vstack(df_destination_records, in_place=True)
  self._iterations.append(iteration)
  self.pagination = pagination
- self.modified_at.append(datetime.utcnow())
+ self.modified_at.append(datetime.now(tz=UTC))
+
+ logger.info(
+ f"Added {df_destination_records.height} records to buffer for iteration {iteration} - {self.df_destination_records.estimated_size(unit='mb')} MB"
+ )
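The buffer is now a polars DataFrame built from `destination_record_schema`: incoming iterations are appended with `vstack` and the size check uses polars' `estimated_size`. A small sketch of that pattern with a stand-in schema (the real schema lives in the destinations `.models` module):

    import polars as pl

    # Stand-in for destination_record_schema
    schema = {"bizon_id": pl.Utf8, "source_data": pl.Utf8}
    buffer = pl.DataFrame(schema=schema)

    incoming = pl.DataFrame({"bizon_id": ["a"], "source_data": ['{"id": "1"}']})
    buffer.vstack(incoming, in_place=True)

    print(buffer.height, buffer.estimated_size(unit="b"))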