bizon 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/cli/main.py +14 -0
- bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml +34 -0
- bizon/connectors/destinations/bigquery/src/destination.py +5 -1
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +1 -38
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +50 -44
- bizon/connectors/destinations/file/config/file_incremental.example.yml +22 -0
- bizon/connectors/destinations/file/src/destination.py +59 -2
- bizon/connectors/destinations/logger/config/logger_incremental.example.yml +21 -0
- bizon/connectors/destinations/logger/src/destination.py +13 -1
- bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml +51 -0
- bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml +40 -0
- bizon/connectors/sources/notion/config/api_key_incremental.example.yml +48 -0
- bizon/connectors/sources/notion/src/source.py +343 -1
- bizon/engine/pipeline/producer.py +42 -1
- bizon/source/config.py +6 -0
- bizon/source/models.py +2 -1
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/METADATA +125 -1
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/RECORD +21 -15
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/WHEEL +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/entry_points.txt +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/licenses/LICENSE +0 -0
bizon/cli/main.py
CHANGED
@@ -1,4 +1,5 @@
 import click
+from dotenv import find_dotenv, load_dotenv

 from bizon.engine.engine import RunnerFactory
 from bizon.engine.runner.config import LoggerLevel
@@ -95,15 +96,28 @@ def destination():
     show_default=True,
     help="Log level to use.",
 )
+@click.option(
+    "--env-file",
+    required=False,
+    type=click.Path(exists=True),
+    help="Path to .env file to load environment variables from.",
+)
 def run(
     filename: str,
     custom_source: str,
     runner: str,
     log_level: LoggerLevel,
+    env_file: str,
     help="Run a bizon pipeline from a YAML file.",
 ):
     """Run a bizon pipeline from a YAML file."""

+    # Load environment variables from .env file
+    if env_file:
+        load_dotenv(env_file)
+    else:
+        load_dotenv(find_dotenv(".env"))
+
     # Parse config from YAML file as a dictionary
     config = parse_from_yaml(filename)

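A minimal, standalone sketch of the resolution order this option adds, using python-dotenv's public API (the file names are placeholders; this is an illustration, not bizon's code verbatim):

# Illustration of the --env-file behaviour: an explicit path is loaded as-is,
# otherwise python-dotenv walks up from the working directory looking for ".env".
from dotenv import find_dotenv, load_dotenv

def load_environment(env_file: str | None = None) -> None:
    if env_file:
        load_dotenv(env_file)             # e.g. a path passed via --env-file
    else:
        load_dotenv(find_dotenv(".env"))  # nothing is loaded if no .env is found

load_environment()                   # picks up ./.env when present
load_environment(".env.production")  # hypothetical explicit file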
bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml
ADDED
@@ -0,0 +1,34 @@
+name: hubspot contacts to bigquery (incremental)
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for filtering
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    api_key: <MY_API_KEY>
+
+destination:
+  # Authentication: If empty it will be infered.
+  # Must have the bigquery.jobUser
+  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
+  name: bigquery
+  config:
+    buffer_size: 10 # in Mb
+    buffer_flush_timeout: 300 # in seconds
+    dataset_id: bizon_test
+    dataset_location: US
+    project_id: my-gcp-project-id
+    gcs_buffer_bucket: bizon-buffer
+    gcs_buffer_format: parquet
+    # Optional: service_account_key for explicit authentication
+    # service_account_key: >-
+    #   { ... }
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Uses append-only strategy - new records are appended to existing data
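The trailing comments describe the cursor contract rather than showing it; below is a rough, hypothetical illustration of that contract in isolation (the fetch_records callable and the return convention are invented for the example, not taken from bizon's producer):

# Hypothetical sketch of cursor-based incremental sync: remember the largest
# cursor_field value seen and only request newer records on the next run.
def run_incremental(fetch_records, cursor_field: str, last_run: str | None):
    """Return (new_records, next_cursor) for one incremental run."""
    new_records, max_seen = [], last_run
    # First run: last_run is None, so the source behaves like full_refresh.
    for record in fetch_records(updated_after=last_run):
        value = record[cursor_field]          # e.g. "updatedAt" for HubSpot
        if last_run is None or value > last_run:
            new_records.append(record)        # append-only: nothing is overwritten
        if max_seen is None or value > max_seen:
            max_seen = value
    return new_records, max_seen              # max_seen becomes the next run's last_run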
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -210,7 +210,11 @@ class BigQueryDestination(AbstractDestination):
             return True

         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
-            #
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True

         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
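For reference, the same append-then-drop step as a standalone snippet against the google-cloud-bigquery client (the table IDs are placeholders, not values from the package):

from google.cloud import bigquery

client = bigquery.Client()
main_table = "my-project.bizon_test.hubspot_contacts"              # placeholder
temp_table = "my-project.bizon_test.hubspot_contacts_incremental"  # placeholder

# Append the staged incremental rows into the main table, then drop the staging table.
client.query(f"INSERT INTO `{main_table}` SELECT * FROM `{temp_table}`").result()
client.delete_table(temp_table, not_found_ok=True)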
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import tempfile
-from datetime import datetime
 from typing import List, Tuple

 import orjson
@@ -162,39 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         response = write_client.append_rows(iter([request]))
         return response.code().name

-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if (
-                col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
-                and col.default_value_expression is None
-            ):
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @retry(
         retry=retry_if_exception_type(
             (
@@ -281,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):

         if self.config.unnest:
             # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
-            rows_to_insert = [
-                self.safe_cast_record_values(orjson.loads(row))
-                for row in df_destination_records["source_data"].to_list()
-            ]
+            rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
         else:
             df_destination_records = df_destination_records.with_columns(
                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
 from typing import List, Tuple, Type

 import orjson
@@ -40,6 +39,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import BigQueryStreamingV2ConfigDetails
 from .proto_utils import get_proto_schema_and_class
@@ -81,6 +81,17 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

+    @property
+    def temp_table_id(self) -> str:
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            return f"{self.table_id}_temp"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            return f"{self.table_id}_incremental"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            return f"{self.table_id}"
+        # Default fallback
+        return f"{self.table_id}"
+
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
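The property above boils down to a mapping from sync mode to a staging-table suffix; as a plain function for illustration (names are made up):

# Illustrative only: which table a run writes to, depending on sync mode.
def staging_table(table_id: str, sync_mode: str) -> str:
    suffix = {"full_refresh": "_temp", "incremental": "_incremental"}.get(sync_mode, "")
    return f"{table_id}{suffix}"  # "stream" (and anything else) writes straight to the main table

assert staging_table("proj.ds.contacts", "full_refresh") == "proj.ds.contacts_temp"
assert staging_table("proj.ds.contacts", "incremental") == "proj.ds.contacts_incremental"
assert staging_table("proj.ds.contacts", "stream") == "proj.ds.contacts"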
@@ -165,36 +176,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             logger.error(f"Stream name: {stream_name}")
             raise

-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in ["STRING", "JSON"]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
@@ -263,14 +244,14 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
             deserialized_rows.append(deserialized_row)

-        # For large rows, we need to use the main client
+        # For large rows, we need to use the main client (write to temp_table_id)
        job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
-            schema=self.bq_client.get_table(self.
+            schema=self.bq_client.get_table(self.temp_table_id).schema,
             ignore_unknown_values=True,
         )
         load_job = self.bq_client.load_table_from_json(
-            deserialized_rows, self.
+            deserialized_rows, self.temp_table_id, job_config=job_config, timeout=300
         )
         result = load_job.result()
         if load_job.state != "DONE":
@@ -292,9 +273,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             raise

     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-        # Create table if it does not exist
+        # Create table if it does not exist (use temp_table_id for staging)
         schema = self.get_bigquery_schema()
-        table = bigquery.Table(self.
+        table = bigquery.Table(self.temp_table_id, schema=schema)
         time_partitioning = TimePartitioning(
             field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
         )
@@ -305,7 +286,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         try:
             table = self.bq_client.create_table(table)
         except Conflict:
-            table = self.bq_client.get_table(self.
+            table = self.bq_client.get_table(self.temp_table_id)
             # Compare and update schema if needed
             existing_fields = {field.name: field for field in table.schema}
             new_fields = {field.name: field for field in self.get_bigquery_schema()}
@@ -319,12 +300,13 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         table.schema = updated_schema
         table = self.bq_client.update_table(table, ["schema"])

-        # Create the stream
-
-
+        # Create the stream (use temp_table_id for staging)
+        temp_table_parts = self.temp_table_id.split(".")
+        if len(temp_table_parts) == 3:
+            project, dataset, table_name = temp_table_parts
             parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id,
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, temp_table_parts[-1])

         stream_name = f"{parent}/_default"

@@ -333,9 +315,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):

         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(
-                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
-                )
+                self.to_protobuf_serialization(TableRowClass=TableRow, row=orjson.loads(row))
                 for row in df_destination_records["source_data"].to_list()
             ]
         else:
@@ -442,3 +422,29 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         if large_rows:
             logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
+
+    def finalize(self):
+        """Finalize the sync by moving data from temp table to main table based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            # Replace main table with temp table data
+            logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
+            self.bq_client.query(
+                f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            ).result()
+            logger.info(f"Deleting temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            # Direct writes, no finalization needed
+            return True
+
+        return True
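The finalize step differs between the two staged modes only in the SQL it runs; a condensed, illustrative dispatch over the same client calls (placeholder names, not the package's code):

from google.cloud import bigquery

def promote_staging(client: bigquery.Client, main_table: str, temp_table: str, sync_mode: str) -> None:
    # full_refresh replaces the target; incremental appends to it; stream already wrote directly.
    if sync_mode == "full_refresh":
        client.query(f"CREATE OR REPLACE TABLE `{main_table}` AS SELECT * FROM `{temp_table}`").result()
    elif sync_mode == "incremental":
        client.query(f"INSERT INTO `{main_table}` SELECT * FROM `{temp_table}`").result()
    else:
        return
    client.delete_table(temp_table, not_found_ok=True)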
bizon/connectors/destinations/file/config/file_incremental.example.yml
ADDED
@@ -0,0 +1,22 @@
+name: dummy to file (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: file
+  config:
+    format: json
+
+# How incremental sync works with file destination:
+# 1. First run: Behaves like full_refresh (creates new file)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. New records are appended to the existing JSON file
+# 4. Writes to temp file (_incremental.json) then appends to main file on finalize
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -1,13 +1,17 @@
+import os
+import shutil
 from typing import Tuple

 import orjson
 import polars as pl
+from loguru import logger

 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import FileDestinationDetailsConfig

@@ -24,6 +28,30 @@ class FileDestination(AbstractDestination):
         super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config

+    @property
+    def file_path(self) -> str:
+        """Main output file path."""
+        return f"{self.destination_id}.json"
+
+    @property
+    def temp_file_path(self) -> str:
+        """Temp file path for FULL_REFRESH mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            return f"{self.destination_id}_temp.json"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            return f"{self.destination_id}_incremental.json"
+        return self.file_path
+
+    @property
+    def write_path(self) -> str:
+        """Get the path to write to based on sync mode."""
+        if self.sync_metadata.sync_mode in [
+            SourceSyncModes.FULL_REFRESH.value,
+            SourceSyncModes.INCREMENTAL.value,
+        ]:
+            return self.temp_file_path
+        return self.file_path
+
     def check_connection(self) -> bool:
         return True

@@ -34,7 +62,7 @@ class FileDestination(AbstractDestination):
         if self.config.unnest:
             schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])

-            with open(
+            with open(self.write_path, "a") as f:
                 for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
                     assert set(value.keys()) == schema_keys, "Keys do not match the schema"

@@ -46,6 +74,35 @@ class FileDestination(AbstractDestination):
                     f.write(f"{orjson.dumps(row).decode('utf-8')}\n")

         else:
-
+            # Append mode for incremental, overwrite for full refresh on first write
+            with open(self.write_path, "a") as f:
+                for record in df_destination_records.iter_rows(named=True):
+                    f.write(f"{orjson.dumps(record).decode('utf-8')}\n")

         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync by moving temp file to main file based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            # Replace main file with temp file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Moving {self.temp_file_path} to {self.file_path}")
+                shutil.move(self.temp_file_path, self.file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            # Append temp file contents to main file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Appending {self.temp_file_path} to {self.file_path}")
+                with open(self.file_path, "a") as main_file:
+                    with open(self.temp_file_path) as temp_file:
+                        main_file.write(temp_file.read())
+                os.remove(self.temp_file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            # Direct writes, no finalization needed
+            logger.info("File destination: STREAM sync batch completed")
+            return True
+
+        return True
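The file destination applies the same stage-then-promote idea to plain JSONL files; a toy, self-contained version of that lifecycle (file names invented for the example):

import os
import shutil

def finalize_file(main_path: str, temp_path: str, sync_mode: str) -> None:
    # Toy version: full_refresh swaps the temp file in, incremental appends it, stream wrote directly.
    if not os.path.exists(temp_path):
        return
    if sync_mode == "full_refresh":
        shutil.move(temp_path, main_path)
    elif sync_mode == "incremental":
        with open(main_path, "a") as main_file, open(temp_path) as temp_file:
            main_file.write(temp_file.read())
        os.remove(temp_path)

# Records staged during a run, then promoted once at the end:
with open("creatures_incremental.json", "w") as f:
    f.write('{"id": 1}\n{"id": 2}\n')
finalize_file("creatures.json", "creatures_incremental.json", "incremental")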
bizon/connectors/destinations/logger/config/logger_incremental.example.yml
ADDED
@@ -0,0 +1,21 @@
+name: dummy to logger (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Logger outputs records with [incremental] prefix for easy identification
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -8,6 +8,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import LoggerDestinationConfig

@@ -36,6 +37,17 @@ class LoggerDestination(AbstractDestination):
         return True

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        sync_mode_label = f"[{self.sync_metadata.sync_mode}]" if self.sync_metadata.sync_mode else ""
         for record in df_destination_records.iter_rows(named=True):
-            logger.info(record[
+            logger.info(f"{sync_mode_label} {record['source_data']}")
         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync - logs completion message based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            logger.info("Logger destination: FULL_REFRESH sync completed")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            logger.info("Logger destination: INCREMENTAL sync completed (records appended)")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            logger.info("Logger destination: STREAM sync batch completed")
+        return True
bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml
ADDED
@@ -0,0 +1,51 @@
+name: gsheets incremental sync
+
+source:
+  name: gsheets
+  stream: worksheet
+  sync_mode: incremental
+  cursor_field: updated_at # Column name in your sheet containing timestamps
+  spreadsheet_url: <MY_SPREADSHEET_URL>
+  worksheet_name: Sheet1
+  service_account_key: >-
+    {
+      "type": "service_account",
+      "project_id": "<MY_GCP_PROJECT>",
+      "private_key_id": "xxx",
+      "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
+      "client_email": "bizon@<MY_GCP_PROJECT>.iam.gserviceaccount.com",
+      "client_id": "999999999999",
+      "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+      "token_uri": "https://oauth2.googleapis.com/token",
+      "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+      "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...",
+      "universe_domain": "googleapis.com"
+    }
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: gsheets_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Google Sheets:
+# - First run: Fetches all rows (full refresh behavior)
+# - Subsequent runs: Only fetches rows where cursor_field > last_run
+#
+# IMPORTANT: Your Google Sheet must have a timestamp column for incremental sync.
+# Common patterns:
+# - Add an "updated_at" column with formula: =NOW() (updates on edit)
+# - Use Google Apps Script to auto-update timestamps on row changes
+# - Manually maintain a "last_modified" column
+#
+# If your sheet doesn't have timestamps, use sync_mode: full_refresh instead.
bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,40 @@
+name: hubspot contacts incremental sync
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for contacts
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    params:
+      token: <MY_API_KEY>
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: hubspot_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for HubSpot:
+# - First run: Fetches all contacts (full refresh behavior)
+# - Subsequent runs: Only fetches contacts where updatedAt > last_run
+#
+# Common cursor fields by stream:
+# - contacts: updatedAt
+# - companies: updatedAt
+# - deals: updatedAt
+# - tickets: updatedAt
+# - products: updatedAt
bizon/connectors/sources/notion/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,48 @@
+name: notion pages incremental sync
+
+source:
+  name: notion
+  stream: pages # Options: databases, data_sources, pages, blocks, users
+  sync_mode: incremental
+  cursor_field: last_edited_time # Notion's timestamp field
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: notion_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Notion:
+# - First run: Fetches all pages/databases (full refresh behavior)
+# - Subsequent runs: Only fetches items where last_edited_time > last_run
+#
+# Supported streams for incremental sync:
+# - pages, all_pages: Uses Search API with last_edited_time filter
+# - databases, all_databases: Uses Search API to find updated data_sources
+# - blocks: First finds updated pages, then fetches their blocks
+# - blocks_markdown, all_blocks_markdown: Same as blocks, converts to markdown
+#
+# Not supported (falls back to full refresh):
+# - users: No timestamp filter available
+# - data_sources: Use databases stream instead
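As an illustration of what "Search API with last_edited_time filter" can amount to in practice, here is a generic sketch against the public Notion API (not the connector's implementation; the token and cursor values are placeholders): sort search results newest-first by last_edited_time and stop paginating once results fall behind the saved cursor.

import requests

NOTION_TOKEN = "secret_xxx"            # placeholder integration token
LAST_RUN = "2024-01-01T00:00:00.000Z"  # placeholder cursor from the previous run

response = requests.post(
    "https://api.notion.com/v1/search",
    headers={
        "Authorization": f"Bearer {NOTION_TOKEN}",
        "Notion-Version": "2022-06-28",
    },
    json={
        "filter": {"property": "object", "value": "page"},
        "sort": {"direction": "descending", "timestamp": "last_edited_time"},
        "page_size": 100,
    },
)
response.raise_for_status()

# Results arrive newest-first, so stop at the first page that predates the cursor.
updated_pages = []
for page in response.json()["results"]:
    if page["last_edited_time"] <= LAST_RUN:  # ISO-8601 strings compare lexicographically here
        break
    updated_pages.append(page)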