bizon 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
+ name: hubspot contacts to bigquery (incremental)
2
+
3
+ source:
4
+ name: hubspot
5
+ stream: contacts
6
+ sync_mode: incremental
7
+ cursor_field: updatedAt # HubSpot's timestamp field for filtering
8
+ properties:
9
+ strategy: all
10
+ authentication:
11
+ type: api_key
12
+ api_key: <MY_API_KEY>
13
+
14
+ destination:
15
+ # Authentication: If empty it will be inferred.
16
+ # Must have the bigquery.jobUser
17
+ # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
18
+ name: bigquery
19
+ config:
20
+ buffer_size: 10 # in Mb
21
+ buffer_flush_timeout: 300 # in seconds
22
+ dataset_id: bizon_test
23
+ dataset_location: US
24
+ project_id: my-gcp-project-id
25
+ gcs_buffer_bucket: bizon-buffer
26
+ gcs_buffer_format: parquet
27
+ # Optional: service_account_key for explicit authentication
28
+ # service_account_key: >-
29
+ # { ... }
30
+
31
+ # How incremental sync works:
32
+ # 1. First run: Behaves like full_refresh (fetches all data)
33
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
34
+ # 3. Uses append-only strategy - new records are appended to existing data
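Editor's note: the append-only strategy described in the comments above boils down to an `INSERT INTO ... SELECT` from a staging table followed by dropping that table, which is what the BigQuery destination change further down this diff implements. A minimal standalone sketch with the `google-cloud-bigquery` client; the table names are placeholders:

```python
from google.cloud import bigquery

# Placeholders: the pipeline derives these from project_id, dataset_id and the stream name.
table_id = "my-gcp-project-id.bizon_test.hubspot_contacts"
temp_table_id = f"{table_id}_incremental"

client = bigquery.Client()

# Append the records staged during this run to the main table ...
client.query(f"INSERT INTO `{table_id}` SELECT * FROM `{temp_table_id}`").result()

# ... then drop the staging table so the next run starts from a clean slate.
client.delete_table(temp_table_id, not_found_ok=True)
```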
@@ -210,7 +210,11 @@ class BigQueryDestination(AbstractDestination):
210
210
  return True
211
211
 
212
212
  elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
213
- # TO DO: Implement incremental sync
213
+ # Append data from incremental temp table to main table
214
+ logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
215
+ self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
216
+ logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
217
+ self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
214
218
  return True
215
219
 
216
220
  elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
@@ -1,6 +1,5 @@
1
1
  import os
2
2
  import tempfile
3
- from datetime import datetime
4
3
  from typing import List, Tuple
5
4
 
6
5
  import orjson
@@ -162,39 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
162
161
  response = write_client.append_rows(iter([request]))
163
162
  return response.code().name
164
163
 
165
- def safe_cast_record_values(self, row: dict):
166
- """
167
- Safe cast record values to the correct type for BigQuery.
168
- """
169
- for col in self.record_schemas[self.destination_id]:
170
- # Handle dicts as strings
171
- if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
172
- if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
173
- row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
174
-
175
- # Handle timestamps
176
- if (
177
- col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
178
- and col.default_value_expression is None
179
- ):
180
- if isinstance(row[col.name], int):
181
- if row[col.name] > datetime(9999, 12, 31).timestamp():
182
- row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
183
- "%Y-%m-%d %H:%M:%S.%f"
184
- )
185
- else:
186
- try:
187
- row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
188
- except ValueError:
189
- error_message = (
190
- f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
191
- f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
192
- "Consider using a transformation."
193
- )
194
- logger.error(error_message)
195
- raise ValueError(error_message)
196
- return row
197
-
198
164
  @retry(
199
165
  retry=retry_if_exception_type(
200
166
  (
@@ -281,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):
281
247
 
282
248
  if self.config.unnest:
283
249
  # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
284
- rows_to_insert = [
285
- self.safe_cast_record_values(orjson.loads(row))
286
- for row in df_destination_records["source_data"].to_list()
287
- ]
250
+ rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
288
251
  else:
289
252
  df_destination_records = df_destination_records.with_columns(
290
253
  pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
@@ -1,7 +1,6 @@
1
1
  import os
2
2
  import tempfile
3
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
4
- from datetime import datetime
5
4
  from typing import List, Tuple, Type
6
5
 
7
6
  import orjson
@@ -40,6 +39,7 @@ from bizon.destination.destination import AbstractDestination
40
39
  from bizon.engine.backend.backend import AbstractBackend
41
40
  from bizon.monitoring.monitor import AbstractMonitor
42
41
  from bizon.source.callback import AbstractSourceCallback
42
+ from bizon.source.config import SourceSyncModes
43
43
 
44
44
  from .config import BigQueryStreamingV2ConfigDetails
45
45
  from .proto_utils import get_proto_schema_and_class
@@ -81,6 +81,17 @@ class BigQueryStreamingV2Destination(AbstractDestination):
81
81
  tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
82
82
  return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
83
83
 
84
+ @property
85
+ def temp_table_id(self) -> str:
86
+ if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
87
+ return f"{self.table_id}_temp"
88
+ elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
89
+ return f"{self.table_id}_incremental"
90
+ elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
91
+ return f"{self.table_id}"
92
+ # Default fallback
93
+ return f"{self.table_id}"
94
+
84
95
  def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
85
96
  if self.config.unnest:
86
97
  if len(list(self.record_schemas.keys())) == 1:
@@ -165,36 +176,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
165
176
  logger.error(f"Stream name: {stream_name}")
166
177
  raise
167
178
 
168
- def safe_cast_record_values(self, row: dict):
169
- """
170
- Safe cast record values to the correct type for BigQuery.
171
- """
172
- for col in self.record_schemas[self.destination_id]:
173
- # Handle dicts as strings
174
- if col.type in ["STRING", "JSON"]:
175
- if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
176
- row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
177
-
178
- # Handle timestamps
179
- if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
180
- if isinstance(row[col.name], int):
181
- if row[col.name] > datetime(9999, 12, 31).timestamp():
182
- row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
183
- "%Y-%m-%d %H:%M:%S.%f"
184
- )
185
- else:
186
- try:
187
- row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
188
- except ValueError:
189
- error_message = (
190
- f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
191
- f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
192
- "Consider using a transformation."
193
- )
194
- logger.error(error_message)
195
- raise ValueError(error_message)
196
- return row
197
-
198
179
  @staticmethod
199
180
  def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
200
181
  """Convert a row to a Protobuf serialization."""
@@ -263,14 +244,14 @@ class BigQueryStreamingV2Destination(AbstractDestination):
263
244
  deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
264
245
  deserialized_rows.append(deserialized_row)
265
246
 
266
- # For large rows, we need to use the main client
247
+ # For large rows, we need to use the main client (write to temp_table_id)
267
248
  job_config = bigquery.LoadJobConfig(
268
249
  source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
269
- schema=self.bq_client.get_table(self.table_id).schema,
250
+ schema=self.bq_client.get_table(self.temp_table_id).schema,
270
251
  ignore_unknown_values=True,
271
252
  )
272
253
  load_job = self.bq_client.load_table_from_json(
273
- deserialized_rows, self.table_id, job_config=job_config, timeout=300
254
+ deserialized_rows, self.temp_table_id, job_config=job_config, timeout=300
274
255
  )
275
256
  result = load_job.result()
276
257
  if load_job.state != "DONE":
@@ -292,9 +273,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
292
273
  raise
293
274
 
294
275
  def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
295
- # Create table if it does not exist
276
+ # Create table if it does not exist (use temp_table_id for staging)
296
277
  schema = self.get_bigquery_schema()
297
- table = bigquery.Table(self.table_id, schema=schema)
278
+ table = bigquery.Table(self.temp_table_id, schema=schema)
298
279
  time_partitioning = TimePartitioning(
299
280
  field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
300
281
  )
@@ -305,7 +286,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
305
286
  try:
306
287
  table = self.bq_client.create_table(table)
307
288
  except Conflict:
308
- table = self.bq_client.get_table(self.table_id)
289
+ table = self.bq_client.get_table(self.temp_table_id)
309
290
  # Compare and update schema if needed
310
291
  existing_fields = {field.name: field for field in table.schema}
311
292
  new_fields = {field.name: field for field in self.get_bigquery_schema()}
@@ -319,12 +300,13 @@ class BigQueryStreamingV2Destination(AbstractDestination):
319
300
  table.schema = updated_schema
320
301
  table = self.bq_client.update_table(table, ["schema"])
321
302
 
322
- # Create the stream
323
- if self.destination_id:
324
- project, dataset, table_name = self.destination_id.split(".")
303
+ # Create the stream (use temp_table_id for staging)
304
+ temp_table_parts = self.temp_table_id.split(".")
305
+ if len(temp_table_parts) == 3:
306
+ project, dataset, table_name = temp_table_parts
325
307
  parent = BigQueryWriteClient.table_path(project, dataset, table_name)
326
308
  else:
327
- parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, self.destination_id)
309
+ parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, temp_table_parts[-1])
328
310
 
329
311
  stream_name = f"{parent}/_default"
330
312
 
@@ -333,9 +315,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
333
315
 
334
316
  if self.config.unnest:
335
317
  serialized_rows = [
336
- self.to_protobuf_serialization(
337
- TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
338
- )
318
+ self.to_protobuf_serialization(TableRowClass=TableRow, row=orjson.loads(row))
339
319
  for row in df_destination_records["source_data"].to_list()
340
320
  ]
341
321
  else:
@@ -442,3 +422,29 @@ class BigQueryStreamingV2Destination(AbstractDestination):
442
422
  if large_rows:
443
423
  logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
444
424
  yield {"stream_batch": current_batch, "json_batch": large_rows}
425
+
426
+ def finalize(self):
427
+ """Finalize the sync by moving data from temp table to main table based on sync mode."""
428
+ if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
429
+ # Replace main table with temp table data
430
+ logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
431
+ self.bq_client.query(
432
+ f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
433
+ ).result()
434
+ logger.info(f"Deleting temp table {self.temp_table_id} ...")
435
+ self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
436
+ return True
437
+
438
+ elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
439
+ # Append data from incremental temp table to main table
440
+ logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
441
+ self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
442
+ logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
443
+ self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
444
+ return True
445
+
446
+ elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
447
+ # Direct writes, no finalization needed
448
+ return True
449
+
450
+ return True
@@ -0,0 +1,22 @@
1
+ name: dummy to file (incremental)
2
+
3
+ source:
4
+ name: dummy
5
+ stream: creatures
6
+ sync_mode: incremental
7
+ cursor_field: updated_at # Field to filter records by timestamp
8
+ authentication:
9
+ type: api_key
10
+ params:
11
+ token: dummy_key
12
+
13
+ destination:
14
+ name: file
15
+ config:
16
+ format: json
17
+
18
+ # How incremental sync works with file destination:
19
+ # 1. First run: Behaves like full_refresh (creates new file)
20
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
21
+ # 3. New records are appended to the existing JSON file
22
+ # 4. Writes to temp file (_incremental.json) then appends to main file on finalize
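Editor's note: the temp-file-then-append behaviour described in the last comment is implemented by the FileDestination change later in this diff. A minimal sketch of the pattern on its own, with placeholder paths:

```python
import os

# Placeholder paths: the destination derives them from destination_id and sync_mode.
main_path = "creatures.json"               # main NDJSON output file
temp_path = "creatures_incremental.json"   # staging file written during an incremental run


def finalize_incremental() -> None:
    """Append the records staged during this run to the main file, then drop the staging file."""
    if not os.path.exists(temp_path):
        return
    with open(main_path, "a") as main_file, open(temp_path) as temp_file:
        main_file.write(temp_file.read())
    os.remove(temp_path)


finalize_incremental()
```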
@@ -1,13 +1,17 @@
1
+ import os
2
+ import shutil
1
3
  from typing import Tuple
2
4
 
3
5
  import orjson
4
6
  import polars as pl
7
+ from loguru import logger
5
8
 
6
9
  from bizon.common.models import SyncMetadata
7
10
  from bizon.destination.destination import AbstractDestination
8
11
  from bizon.engine.backend.backend import AbstractBackend
9
12
  from bizon.monitoring.monitor import AbstractMonitor
10
13
  from bizon.source.callback import AbstractSourceCallback
14
+ from bizon.source.config import SourceSyncModes
11
15
 
12
16
  from .config import FileDestinationDetailsConfig
13
17
 
@@ -24,6 +28,30 @@ class FileDestination(AbstractDestination):
24
28
  super().__init__(sync_metadata, config, backend, source_callback, monitor)
25
29
  self.config: FileDestinationDetailsConfig = config
26
30
 
31
+ @property
32
+ def file_path(self) -> str:
33
+ """Main output file path."""
34
+ return f"{self.destination_id}.json"
35
+
36
+ @property
37
+ def temp_file_path(self) -> str:
38
+ """Temp file path for FULL_REFRESH mode."""
39
+ if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
40
+ return f"{self.destination_id}_temp.json"
41
+ elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
42
+ return f"{self.destination_id}_incremental.json"
43
+ return self.file_path
44
+
45
+ @property
46
+ def write_path(self) -> str:
47
+ """Get the path to write to based on sync mode."""
48
+ if self.sync_metadata.sync_mode in [
49
+ SourceSyncModes.FULL_REFRESH.value,
50
+ SourceSyncModes.INCREMENTAL.value,
51
+ ]:
52
+ return self.temp_file_path
53
+ return self.file_path
54
+
27
55
  def check_connection(self) -> bool:
28
56
  return True
29
57
 
@@ -34,7 +62,7 @@ class FileDestination(AbstractDestination):
34
62
  if self.config.unnest:
35
63
  schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])
36
64
 
37
- with open(f"{self.destination_id}.json", "a") as f:
65
+ with open(self.write_path, "a") as f:
38
66
  for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
39
67
  assert set(value.keys()) == schema_keys, "Keys do not match the schema"
40
68
 
@@ -46,6 +74,35 @@ class FileDestination(AbstractDestination):
46
74
  f.write(f"{orjson.dumps(row).decode('utf-8')}\n")
47
75
 
48
76
  else:
49
- df_destination_records.write_ndjson(f"{self.destination_id}.json")
77
+ # Append mode for incremental, overwrite for full refresh on first write
78
+ with open(self.write_path, "a") as f:
79
+ for record in df_destination_records.iter_rows(named=True):
80
+ f.write(f"{orjson.dumps(record).decode('utf-8')}\n")
50
81
 
51
82
  return True, ""
83
+
84
+ def finalize(self) -> bool:
85
+ """Finalize the sync by moving temp file to main file based on sync mode."""
86
+ if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
87
+ # Replace main file with temp file
88
+ if os.path.exists(self.temp_file_path):
89
+ logger.info(f"File destination: Moving {self.temp_file_path} to {self.file_path}")
90
+ shutil.move(self.temp_file_path, self.file_path)
91
+ return True
92
+
93
+ elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
94
+ # Append temp file contents to main file
95
+ if os.path.exists(self.temp_file_path):
96
+ logger.info(f"File destination: Appending {self.temp_file_path} to {self.file_path}")
97
+ with open(self.file_path, "a") as main_file:
98
+ with open(self.temp_file_path) as temp_file:
99
+ main_file.write(temp_file.read())
100
+ os.remove(self.temp_file_path)
101
+ return True
102
+
103
+ elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
104
+ # Direct writes, no finalization needed
105
+ logger.info("File destination: STREAM sync batch completed")
106
+ return True
107
+
108
+ return True
@@ -0,0 +1,21 @@
1
+ name: dummy to logger (incremental)
2
+
3
+ source:
4
+ name: dummy
5
+ stream: creatures
6
+ sync_mode: incremental
7
+ cursor_field: updated_at # Field to filter records by timestamp
8
+ authentication:
9
+ type: api_key
10
+ params:
11
+ token: dummy_key
12
+
13
+ destination:
14
+ name: logger
15
+ config:
16
+ dummy: dummy
17
+
18
+ # How incremental sync works:
19
+ # 1. First run: Behaves like full_refresh (fetches all data)
20
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
21
+ # 3. Logger outputs records with [incremental] prefix for easy identification
@@ -8,6 +8,7 @@ from bizon.destination.destination import AbstractDestination
8
8
  from bizon.engine.backend.backend import AbstractBackend
9
9
  from bizon.monitoring.monitor import AbstractMonitor
10
10
  from bizon.source.callback import AbstractSourceCallback
11
+ from bizon.source.config import SourceSyncModes
11
12
 
12
13
  from .config import LoggerDestinationConfig
13
14
 
@@ -36,6 +37,17 @@ class LoggerDestination(AbstractDestination):
36
37
  return True
37
38
 
38
39
  def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
40
+ sync_mode_label = f"[{self.sync_metadata.sync_mode}]" if self.sync_metadata.sync_mode else ""
39
41
  for record in df_destination_records.iter_rows(named=True):
40
- logger.info(record["source_data"])
42
+ logger.info(f"{sync_mode_label} {record['source_data']}")
41
43
  return True, ""
44
+
45
+ def finalize(self) -> bool:
46
+ """Finalize the sync - logs completion message based on sync mode."""
47
+ if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
48
+ logger.info("Logger destination: FULL_REFRESH sync completed")
49
+ elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
50
+ logger.info("Logger destination: INCREMENTAL sync completed (records appended)")
51
+ elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
52
+ logger.info("Logger destination: STREAM sync batch completed")
53
+ return True
@@ -0,0 +1,51 @@
1
+ name: gsheets incremental sync
2
+
3
+ source:
4
+ name: gsheets
5
+ stream: worksheet
6
+ sync_mode: incremental
7
+ cursor_field: updated_at # Column name in your sheet containing timestamps
8
+ spreadsheet_url: <MY_SPREADSHEET_URL>
9
+ worksheet_name: Sheet1
10
+ service_account_key: >-
11
+ {
12
+ "type": "service_account",
13
+ "project_id": "<MY_GCP_PROJECT>",
14
+ "private_key_id": "xxx",
15
+ "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
16
+ "client_email": "bizon@<MY_GCP_PROJECT>.iam.gserviceaccount.com",
17
+ "client_id": "999999999999",
18
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
19
+ "token_uri": "https://oauth2.googleapis.com/token",
20
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
21
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...",
22
+ "universe_domain": "googleapis.com"
23
+ }
24
+
25
+ destination:
26
+ name: bigquery
27
+ config:
28
+ project_id: <MY_GCP_PROJECT>
29
+ dataset_id: gsheets_data
30
+ dataset_location: US
31
+ gcs_buffer_bucket: <MY_GCS_BUCKET>
32
+ gcs_buffer_format: parquet
33
+
34
+ engine:
35
+ backend:
36
+ type: bigquery
37
+ database: <MY_GCP_PROJECT>
38
+ schema: bizon_backend
39
+ syncCursorInDBEvery: 2
40
+
41
+ # Incremental sync for Google Sheets:
42
+ # - First run: Fetches all rows (full refresh behavior)
43
+ # - Subsequent runs: Only fetches rows where cursor_field > last_run
44
+ #
45
+ # IMPORTANT: Your Google Sheet must have a timestamp column for incremental sync.
46
+ # Common patterns:
47
+ # - Add an "updated_at" column with formula: =NOW() (updates on edit)
48
+ # - Use Google Apps Script to auto-update timestamps on row changes
49
+ # - Manually maintain a "last_modified" column
50
+ #
51
+ # If your sheet doesn't have timestamps, use sync_mode: full_refresh instead.
@@ -0,0 +1,40 @@
1
+ name: hubspot contacts incremental sync
2
+
3
+ source:
4
+ name: hubspot
5
+ stream: contacts
6
+ sync_mode: incremental
7
+ cursor_field: updatedAt # HubSpot's timestamp field for contacts
8
+ properties:
9
+ strategy: all
10
+ authentication:
11
+ type: api_key
12
+ params:
13
+ token: <MY_API_KEY>
14
+
15
+ destination:
16
+ name: bigquery
17
+ config:
18
+ project_id: <MY_GCP_PROJECT>
19
+ dataset_id: hubspot_data
20
+ dataset_location: US
21
+ gcs_buffer_bucket: <MY_GCS_BUCKET>
22
+ gcs_buffer_format: parquet
23
+
24
+ engine:
25
+ backend:
26
+ type: bigquery
27
+ database: <MY_GCP_PROJECT>
28
+ schema: bizon_backend
29
+ syncCursorInDBEvery: 2
30
+
31
+ # Incremental sync for HubSpot:
32
+ # - First run: Fetches all contacts (full refresh behavior)
33
+ # - Subsequent runs: Only fetches contacts where updatedAt > last_run
34
+ #
35
+ # Common cursor fields by stream:
36
+ # - contacts: updatedAt
37
+ # - companies: updatedAt
38
+ # - deals: updatedAt
39
+ # - tickets: updatedAt
40
+ # - products: updatedAt
@@ -0,0 +1,48 @@
1
+ name: notion pages incremental sync
2
+
3
+ source:
4
+ name: notion
5
+ stream: pages # Options: databases, data_sources, pages, blocks, users
6
+ sync_mode: incremental
7
+ cursor_field: last_edited_time # Notion's timestamp field
8
+ authentication:
9
+ type: api_key
10
+ params:
11
+ token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
12
+
13
+ # List of database IDs to fetch data from
14
+ database_ids:
15
+ - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
16
+
17
+ # Number of results per API call (1-100, default: 100)
18
+ page_size: 100
19
+
20
+ destination:
21
+ name: bigquery
22
+ config:
23
+ project_id: <MY_GCP_PROJECT>
24
+ dataset_id: notion_data
25
+ dataset_location: US
26
+ gcs_buffer_bucket: <MY_GCS_BUCKET>
27
+ gcs_buffer_format: parquet
28
+
29
+ engine:
30
+ backend:
31
+ type: bigquery
32
+ database: <MY_GCP_PROJECT>
33
+ schema: bizon_backend
34
+ syncCursorInDBEvery: 2
35
+
36
+ # Incremental sync for Notion:
37
+ # - First run: Fetches all pages/databases (full refresh behavior)
38
+ # - Subsequent runs: Only fetches items where last_edited_time > last_run
39
+ #
40
+ # Supported streams for incremental sync:
41
+ # - pages, all_pages: Uses Search API with last_edited_time filter
42
+ # - databases, all_databases: Uses Search API to find updated data_sources
43
+ # - blocks: First finds updated pages, then fetches their blocks
44
+ # - blocks_markdown, all_blocks_markdown: Same as blocks, converts to markdown
45
+ #
46
+ # Not supported (falls back to full refresh):
47
+ # - users: No timestamp filter available
48
+ # - data_sources: Use databases stream instead
@@ -10,7 +10,7 @@ from urllib3.util.retry import Retry
10
10
  from bizon.source.auth.builder import AuthBuilder
11
11
  from bizon.source.auth.config import AuthType
12
12
  from bizon.source.config import SourceConfig
13
- from bizon.source.models import SourceIteration, SourceRecord
13
+ from bizon.source.models import SourceIncrementalState, SourceIteration, SourceRecord
14
14
  from bizon.source.source import AbstractSource
15
15
 
16
16
  from .config import NotionSourceConfig, NotionStreams
@@ -1132,6 +1132,348 @@ class NotionSource(AbstractSource):
1132
1132
 
1133
1133
  return SourceIteration(records=records, next_pagination=next_pagination)
1134
1134
 
1135
+ # ==================== INCREMENTAL SYNC ====================
1136
+
1137
+ def search_with_filter(
1138
+ self, start_cursor: str = None, last_edited_after: str = None, object_type: str = None
1139
+ ) -> dict:
1140
+ """
1141
+ Search with optional last_edited_time filter for incremental sync.
1142
+
1143
+ Note: Notion Search API doesn't support timestamp filtering directly.
1144
+ We sort by last_edited_time descending and filter client-side.
1145
+
1146
+ Args:
1147
+ start_cursor: Pagination cursor
1148
+ last_edited_after: ISO 8601 timestamp to filter by last_edited_time
1149
+ object_type: Optional filter by object type ("page" or "database")
1150
+
1151
+ Returns:
1152
+ Search results filtered by timestamp
1153
+ """
1154
+ payload = {"page_size": self.config.page_size}
1155
+ if start_cursor:
1156
+ payload["start_cursor"] = start_cursor
1157
+
1158
+ # Sort by last_edited_time descending to get most recent first
1159
+ if last_edited_after:
1160
+ payload["sort"] = {"direction": "descending", "timestamp": "last_edited_time"}
1161
+
1162
+ response = self.session.post(f"{BASE_URL}/search", json=payload)
1163
+ response.raise_for_status()
1164
+ result = response.json()
1165
+
1166
+ # Filter by object_type client-side if specified
1167
+ if object_type:
1168
+ result["results"] = [item for item in result.get("results", []) if item.get("object") == object_type]
1169
+
1170
+ # Filter by last_edited_time client-side
1171
+ # Since results are sorted descending, stop when we hit an old item
1172
+ if last_edited_after:
1173
+ filtered_results = []
1174
+ found_old_item = False
1175
+ for item in result.get("results", []):
1176
+ item_edited_time = item.get("last_edited_time", "")
1177
+ if item_edited_time > last_edited_after:
1178
+ filtered_results.append(item)
1179
+ else:
1180
+ found_old_item = True
1181
+ break
1182
+
1183
+ result["results"] = filtered_results
1184
+ # If we found an old item, no need to paginate further
1185
+ if found_old_item:
1186
+ result["has_more"] = False
1187
+
1188
+ return result
1189
+
1190
+ def get_pages_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
1191
+ """
1192
+ Fetch pages updated after source_state.last_run using the Search API with timestamp filter.
1193
+ """
1194
+ cursor = pagination.get("start_cursor") if pagination else None
1195
+ last_edited_after = source_state.last_run.isoformat()
1196
+
1197
+ result = self.search_with_filter(start_cursor=cursor, last_edited_after=last_edited_after, object_type="page")
1198
+
1199
+ records = [SourceRecord(id=page["id"], data=page) for page in result.get("results", [])]
1200
+
1201
+ logger.info(f"Incremental sync: fetched {len(records)} pages updated after {last_edited_after}")
1202
+
1203
+ next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
1204
+
1205
+ return SourceIteration(records=records, next_pagination=next_pagination)
1206
+
1207
+ def get_all_pages_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
1208
+ """
1209
+ Fetch all pages accessible to the integration updated after source_state.last_run.
1210
+ Same as get_pages_after but without database_ids filter.
1211
+ """
1212
+ return self.get_pages_after(source_state, pagination)
1213
+
1214
+ def get_databases_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
1215
+ """
1216
+ Fetch databases updated after source_state.last_run.
1217
+ """
1218
+ cursor = pagination.get("start_cursor") if pagination else None
1219
+ last_edited_after = source_state.last_run.isoformat()
1220
+
1221
+ # Search for data_sources (databases don't appear directly in search in 2025-09-03 API)
1222
+ result = self.search_with_filter(
1223
+ start_cursor=cursor, last_edited_after=last_edited_after, object_type="data_source"
1224
+ )
1225
+
1226
+ # Extract unique database IDs from data_sources
1227
+ seen_db_ids = set()
1228
+ records = []
1229
+ for ds in result.get("results", []):
1230
+ parent = ds.get("parent", {})
1231
+ if parent.get("type") == "database_id":
1232
+ db_id = parent.get("database_id")
1233
+ if db_id and db_id not in seen_db_ids:
1234
+ seen_db_ids.add(db_id)
1235
+ try:
1236
+ db_data = self.get_database(db_id)
1237
+ records.append(SourceRecord(id=db_data["id"], data=db_data))
1238
+ except Exception as e:
1239
+ logger.error(f"Failed to fetch database {db_id}: {e}")
1240
+
1241
+ logger.info(f"Incremental sync: fetched {len(records)} databases updated after {last_edited_after}")
1242
+
1243
+ next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
1244
+
1245
+ return SourceIteration(records=records, next_pagination=next_pagination)
1246
+
1247
+ def get_blocks_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
1248
+ """
1249
+ Fetch blocks from pages updated after source_state.last_run.
1250
+ First finds updated pages, then fetches their blocks.
1251
+ """
1252
+ if pagination:
1253
+ items_to_process = pagination.get("items_to_process", [])
1254
+ items_loaded = pagination.get("items_loaded", False)
1255
+ search_cursor = pagination.get("search_cursor")
1256
+ else:
1257
+ items_to_process = []
1258
+ items_loaded = False
1259
+ search_cursor = None
1260
+
1261
+ last_edited_after = source_state.last_run.isoformat()
1262
+
1263
+ # Collect pages updated after last_run
1264
+ if not items_loaded:
1265
+ while True:
1266
+ result = self.search_with_filter(
1267
+ start_cursor=search_cursor, last_edited_after=last_edited_after, object_type="page"
1268
+ )
1269
+ for page in result.get("results", []):
1270
+ items_to_process.append(
1271
+ {
1272
+ "block_id": page["id"],
1273
+ "input_db_id": None,
1274
+ "input_page_id": None,
1275
+ "source_page_id": page["id"],
1276
+ }
1277
+ )
1278
+
1279
+ if result.get("has_more"):
1280
+ search_cursor = result.get("next_cursor")
1281
+ else:
1282
+ break
1283
+
1284
+ items_loaded = True
1285
+ logger.info(f"Incremental sync: found {len(items_to_process)} pages updated after {last_edited_after}")
1286
+
1287
+ if not items_to_process:
1288
+ return SourceIteration(records=[], next_pagination={})
1289
+
1290
+ # Process a batch in parallel
1291
+ batch_size = self.config.max_workers
1292
+ batch = items_to_process[:batch_size]
1293
+ items_to_process = items_to_process[batch_size:]
1294
+
1295
+ records = []
1296
+
1297
+ def fetch_item_blocks(item_info: dict) -> List[dict]:
1298
+ return self.fetch_blocks_recursively(
1299
+ block_id=item_info["block_id"],
1300
+ parent_input_database_id=item_info["input_db_id"],
1301
+ parent_input_page_id=item_info["input_page_id"],
1302
+ source_page_id=item_info["source_page_id"],
1303
+ )
1304
+
1305
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1306
+ futures = {executor.submit(fetch_item_blocks, item_info): item_info for item_info in batch}
1307
+ for future in as_completed(futures):
1308
+ item_info = futures[future]
1309
+ try:
1310
+ blocks = future.result()
1311
+ for block in blocks:
1312
+ records.append(SourceRecord(id=block["id"], data=block))
1313
+ except Exception as e:
1314
+ logger.error(f"Failed to fetch blocks from {item_info['block_id']}: {e}")
1315
+
1316
+ next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
1317
+
1318
+ return SourceIteration(records=records, next_pagination=next_pagination)
1319
+
1320
+ def get_blocks_markdown_after(
1321
+ self, source_state: SourceIncrementalState, pagination: dict = None
1322
+ ) -> SourceIteration:
1323
+ """
1324
+ Fetch blocks from pages updated after source_state.last_run and convert to markdown.
1325
+ Respects database_ids and database_filters configuration.
1326
+ """
1327
+ if pagination:
1328
+ items_to_process = pagination.get("items_to_process", [])
1329
+ items_loaded = pagination.get("items_loaded", False)
1330
+ else:
1331
+ items_to_process = []
1332
+ items_loaded = False
1333
+
1334
+ last_edited_after = source_state.last_run.isoformat()
1335
+
1336
+ # Collect pages updated after last_run from configured databases
1337
+ if not items_loaded:
1338
+ # Query each configured database with timestamp filter
1339
+ for db_id in self.config.database_ids:
1340
+ try:
1341
+ db_data = self.get_database(db_id)
1342
+ db_filter = self.get_filter_for_database(db_id)
1343
+
1344
+ for ds in db_data.get("data_sources", []):
1345
+ ds_cursor = None
1346
+ while True:
1347
+ # Build filter with last_edited_time constraint
1348
+ incremental_filter = {
1349
+ "timestamp": "last_edited_time",
1350
+ "last_edited_time": {"after": last_edited_after},
1351
+ }
1352
+ # Combine with existing database filter if present
1353
+ if db_filter:
1354
+ combined_filter = {"and": [incremental_filter, db_filter]}
1355
+ else:
1356
+ combined_filter = incremental_filter
1357
+
1358
+ result = self.query_data_source(ds["id"], ds_cursor, filter=combined_filter)
1359
+ for page in result.get("results", []):
1360
+ items_to_process.append(
1361
+ {
1362
+ "block_id": page["id"],
1363
+ "input_db_id": db_id,
1364
+ "input_page_id": None,
1365
+ "source_page_id": page["id"],
1366
+ }
1367
+ )
1368
+
1369
+ if result.get("has_more"):
1370
+ ds_cursor = result.get("next_cursor")
1371
+ else:
1372
+ break
1373
+ except Exception as e:
1374
+ logger.error(f"Failed to query database {db_id} for incremental sync: {e}")
1375
+
1376
+ # Also check configured page_ids (filter by last_edited_time)
1377
+ for page_id in self.config.page_ids:
1378
+ try:
1379
+ page_data = self.get_page(page_id)
1380
+ if page_data.get("last_edited_time", "") > last_edited_after:
1381
+ items_to_process.append(
1382
+ {
1383
+ "block_id": page_id,
1384
+ "input_db_id": None,
1385
+ "input_page_id": page_id,
1386
+ "source_page_id": page_id,
1387
+ }
1388
+ )
1389
+ except Exception as e:
1390
+ logger.error(f"Failed to fetch page {page_id} for incremental sync: {e}")
1391
+
1392
+ items_loaded = True
1393
+ logger.info(
1394
+ f"Incremental sync: found {len(items_to_process)} pages for blocks_markdown after {last_edited_after}"
1395
+ )
1396
+
1397
+ if not items_to_process:
1398
+ return SourceIteration(records=[], next_pagination={})
1399
+
1400
+ # Process a batch in parallel
1401
+ batch_size = self.config.max_workers
1402
+ batch = items_to_process[:batch_size]
1403
+ items_to_process = items_to_process[batch_size:]
1404
+
1405
+ records = []
1406
+
1407
+ def fetch_and_convert_item(item_info: dict) -> List[dict]:
1408
+ blocks = self.fetch_blocks_recursively(
1409
+ block_id=item_info["block_id"],
1410
+ parent_input_database_id=item_info["input_db_id"],
1411
+ parent_input_page_id=item_info["input_page_id"],
1412
+ source_page_id=item_info["source_page_id"],
1413
+ fetch_child_databases=False,
1414
+ )
1415
+
1416
+ block_records = []
1417
+ for block in blocks or []:
1418
+ if not block:
1419
+ continue
1420
+ md = self._block_to_markdown(block)
1421
+ block_records.append(
1422
+ {
1423
+ "block_id": block.get("id"),
1424
+ "block_type": block.get("type"),
1425
+ "markdown": md,
1426
+ "source_page_id": block.get("source_page_id"),
1427
+ "parent_block_id": block.get("parent_block_id"),
1428
+ "parent_input_database_id": block.get("parent_input_database_id"),
1429
+ "parent_input_page_id": block.get("parent_input_page_id"),
1430
+ "depth": block.get("depth"),
1431
+ "block_order": block.get("block_order"),
1432
+ "page_order": block.get("page_order"),
1433
+ "block_raw": block,
1434
+ }
1435
+ )
1436
+ return block_records
1437
+
1438
+ with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
1439
+ futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
1440
+ for future in as_completed(futures):
1441
+ item_info = futures[future]
1442
+ try:
1443
+ block_records = future.result()
1444
+ for block_record in block_records:
1445
+ records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
1446
+ except Exception as e:
1447
+ logger.error(f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}")
1448
+
1449
+ next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
1450
+
1451
+ return SourceIteration(records=records, next_pagination=next_pagination)
1452
+
1453
+ def get_records_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
1454
+ """
1455
+ Fetch records updated after source_state.last_run for incremental sync.
1456
+
1457
+ Supported streams:
1458
+ - pages, all_pages: Uses Search API with last_edited_time filter
1459
+ - databases, all_databases: Uses Search API to find updated data_sources
1460
+ - blocks, all_blocks_markdown: First finds updated pages, then fetches their blocks
1461
+ """
1462
+ stream = self.config.stream
1463
+
1464
+ if stream in [NotionStreams.PAGES, NotionStreams.ALL_PAGES]:
1465
+ return self.get_pages_after(source_state, pagination)
1466
+ elif stream in [NotionStreams.DATABASES, NotionStreams.ALL_DATABASES]:
1467
+ return self.get_databases_after(source_state, pagination)
1468
+ elif stream == NotionStreams.BLOCKS:
1469
+ return self.get_blocks_after(source_state, pagination)
1470
+ elif stream in [NotionStreams.BLOCKS_MARKDOWN, NotionStreams.ALL_BLOCKS_MARKDOWN]:
1471
+ return self.get_blocks_markdown_after(source_state, pagination)
1472
+ else:
1473
+ # For streams that don't support incremental, fall back to full refresh
1474
+ logger.warning(f"Stream {stream} does not support incremental sync, falling back to full refresh")
1475
+ return self.get(pagination)
1476
+
1135
1477
  # ==================== MAIN DISPATCH ====================
1136
1478
 
1137
1479
  def get(self, pagination: dict = None) -> SourceIteration:
@@ -14,7 +14,9 @@ from bizon.common.models import BizonConfig
14
14
  from bizon.engine.backend.backend import AbstractBackend
15
15
  from bizon.engine.backend.models import CursorStatus
16
16
  from bizon.engine.queue.queue import AbstractQueue
17
+ from bizon.source.config import SourceSyncModes
17
18
  from bizon.source.cursor import Cursor
19
+ from bizon.source.models import SourceIncrementalState
18
20
  from bizon.source.source import AbstractSource
19
21
 
20
22
  from .models import PipelineReturnStatus
@@ -130,6 +132,37 @@ class Producer:
130
132
  self.queue.terminate(iteration=0)
131
133
  return PipelineReturnStatus.BACKEND_ERROR
132
134
 
135
+ # Handle incremental sync mode
136
+ source_incremental_state = None
137
+ is_incremental = self.bizon_config.source.sync_mode == SourceSyncModes.INCREMENTAL
138
+
139
+ if is_incremental:
140
+ # Get the last successful job to determine last_run timestamp
141
+ last_successful_job = self.backend.get_last_successful_stream_job(
142
+ name=self.bizon_config.name,
143
+ source_name=self.bizon_config.source.name,
144
+ stream_name=self.bizon_config.source.stream,
145
+ )
146
+
147
+ if last_successful_job:
148
+ # Create incremental state with last_run from previous job
149
+ source_incremental_state = SourceIncrementalState(
150
+ last_run=last_successful_job.created_at,
151
+ state={},
152
+ cursor_field=self.bizon_config.source.cursor_field,
153
+ )
154
+ logger.info(
155
+ f"Incremental sync: fetching records after {source_incremental_state.last_run} "
156
+ f"using cursor_field: {source_incremental_state.cursor_field}"
157
+ )
158
+ else:
159
+ # First incremental run - fall back to full refresh behavior
160
+ logger.info(
161
+ "Incremental sync: No previous successful job found. "
162
+ "Falling back to full refresh behavior for first run."
163
+ )
164
+ is_incremental = False
165
+
133
166
  while not cursor.is_finished:
134
167
  if stop_event.is_set():
135
168
  logger.info("Stop event is set, terminating producer ...")
@@ -180,7 +213,15 @@ class Producer:
180
213
 
181
214
  # Get the next data
182
215
  try:
183
- source_iteration = self.source.get(pagination=cursor.pagination)
216
+ if is_incremental and source_incremental_state:
217
+ # Use incremental fetching with get_records_after
218
+ source_iteration = self.source.get_records_after(
219
+ source_state=source_incremental_state,
220
+ pagination=cursor.pagination,
221
+ )
222
+ else:
223
+ # Use standard fetching with get
224
+ source_iteration = self.source.get(pagination=cursor.pagination)
184
225
  except Exception as e:
185
226
  logger.error(traceback.format_exc())
186
227
  logger.error(
bizon/source/config.py CHANGED
@@ -42,6 +42,12 @@ class SourceConfig(BaseModel, ABC):
42
42
  default=SourceSyncModes.FULL_REFRESH,
43
43
  )
44
44
 
45
+ cursor_field: Optional[str] = Field(
46
+ default=None,
47
+ description="Field name to use for incremental filtering (e.g., 'updated_at', 'modified_at'). "
48
+ "Source will fetch records where this field > last_run timestamp.",
49
+ )
50
+
45
51
  force_ignore_checkpoint: bool = Field(
46
52
  description="Whether to force recreate the sync from iteration 0. Existing checkpoints will be ignored.",
47
53
  default=False,
bizon/source/models.py CHANGED
@@ -44,4 +44,5 @@ class SourceIteration(BaseModel):
44
44
 
45
45
  class SourceIncrementalState(BaseModel):
46
46
  last_run: datetime = Field(..., description="Timestamp of the last successful run")
47
- state: dict = Field(..., description="Incremental state information from the latest sync")
47
+ state: dict = Field(default_factory=dict, description="Incremental state information from the latest sync")
48
+ cursor_field: Optional[str] = Field(default=None, description="The field name to filter records by timestamp")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bizon
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
5
5
  Author-email: Antoine Balliet <antoine.balliet@gmail.com>, Anas El Mhamdi <anas.elmhamdi@gmail.com>
6
6
  License-File: LICENSE
@@ -153,6 +153,130 @@ Runner is the interface used by Bizon to run the pipeline. It can be configured
153
153
  - `process` (asynchronous)
154
154
  - `stream` (synchronous)
155
155
 
156
+ ## Sync Modes
157
+
158
+ Bizon supports three sync modes:
159
+ - `full_refresh`: Re-syncs all data from scratch on each run
160
+ - `incremental`: Syncs only new/updated data since the last successful run
161
+ - `stream`: Continuous streaming mode for real-time data (e.g., Kafka)
162
+
163
+ ### Incremental Sync
164
+
165
+ Incremental sync fetches only new or updated records since the last successful run, using an **append-only** strategy.
166
+
167
+ #### Configuration
168
+
169
+ ```yaml
170
+ source:
171
+ name: your_source
172
+ stream: your_stream
173
+ sync_mode: incremental
174
+ cursor_field: updated_at # The timestamp field to filter records by
175
+ ```
176
+
177
+ #### How It Works
178
+
179
+ ```
180
+ ┌─────────────────────────────────────────────────────────────────────┐
181
+ │ INCREMENTAL SYNC FLOW │
182
+ ├─────────────────────────────────────────────────────────────────────┤
183
+ │ │
184
+ │ 1. Producer checks for last successful job │
185
+ │ └─> Backend.get_last_successful_stream_job() │
186
+ │ │
187
+ │ 2. If found, creates SourceIncrementalState: │
188
+ │ └─> last_run = previous_job.created_at │
189
+ │ └─> cursor_field = config.cursor_field (e.g., "updated_at") │
190
+ │ │
191
+ │ 3. Calls source.get_records_after(source_state, pagination) │
192
+ │ └─> Source filters: WHERE cursor_field > last_run │
193
+ │ │
194
+ │ 4. Records written to temp table: {table}_incremental │
195
+ │ │
196
+ │ 5. finalize() appends temp table to main table │
197
+ │ └─> INSERT INTO main_table SELECT * FROM temp_table │
198
+ │ └─> Deletes temp table │
199
+ │ │
200
+ │ FIRST RUN: No previous job → falls back to get() (full refresh) │
201
+ │ │
202
+ └─────────────────────────────────────────────────────────────────────┘
203
+ ```
204
+
205
+ #### Configuration Options
206
+
207
+ | Option | Required | Description | Example |
208
+ |--------|----------|-------------|---------|
209
+ | `sync_mode` | Yes | Set to `incremental` | `incremental` |
210
+ | `cursor_field` | Yes | Timestamp field to filter by | `updated_at`, `last_edited_time`, `modified_at` |
211
+
212
+ #### Supported Sources
213
+
214
+ Sources must implement `get_records_after()` to support incremental sync:
215
+
216
+ | Source | Cursor Field | Notes |
217
+ |--------|--------------|-------|
218
+ | `notion` | `last_edited_time` | Supports `pages`, `databases`, `blocks`, `blocks_markdown` streams |
219
+ | (others) | Varies | Check source docs or implement `get_records_after()` |
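Editor's note: for sources not listed above, a connector needs a `get_records_after()` method with the signature added in this release. A minimal sketch for a hypothetical REST source — the endpoint and the `updated_after` query parameter are made up, while the bizon models match the ones shipped in this diff:

```python
import requests

from bizon.source.models import SourceIncrementalState, SourceIteration, SourceRecord


class MyApiSource:  # in a real connector this subclasses bizon's AbstractSource
    def get_records_after(
        self, source_state: SourceIncrementalState, pagination: dict = None
    ) -> SourceIteration:
        # Ask the API only for records whose cursor field is newer than the last successful run.
        params = {
            "updated_after": source_state.last_run.isoformat(),  # hypothetical query parameter
            "cursor": (pagination or {}).get("cursor"),
        }
        payload = requests.get("https://api.example.com/items", params=params).json()

        records = [SourceRecord(id=item["id"], data=item) for item in payload.get("items", [])]
        next_pagination = {"cursor": payload["next_cursor"]} if payload.get("next_cursor") else {}
        return SourceIteration(records=records, next_pagination=next_pagination)
```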
220
+
221
+ #### Supported Destinations
222
+
223
+ Destinations must implement `finalize()` with incremental logic:
224
+
225
+ | Destination | Support | Notes |
226
+ |-------------|---------|-------|
227
+ | `bigquery` | ✅ | Append-only via temp table |
228
+ | `bigquery_streaming_v2` | ✅ | Append-only via temp table |
229
+ | `file` | ✅ | Appends to existing file |
230
+ | `logger` | ✅ | Logs completion |
231
+
232
+ #### Example: Notion Incremental Sync
233
+
234
+ ```yaml
235
+ name: notion_incremental_sync
236
+
237
+ source:
238
+ name: notion
239
+ stream: blocks_markdown
240
+ sync_mode: incremental
241
+ cursor_field: last_edited_time
242
+ authentication:
243
+ type: api_key
244
+ params:
245
+ token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
246
+
247
+ database_ids:
248
+ - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
249
+
250
+ # Optional: filter which pages to sync
251
+ database_filters:
252
+ "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx":
253
+ property: "Status"
254
+ select:
255
+ equals: "Published"
256
+
257
+ destination:
258
+ name: bigquery
259
+ config:
260
+ project_id: my-gcp-project
261
+ dataset_id: notion_data
262
+ dataset_location: US
263
+
264
+ engine:
265
+ backend:
266
+ type: bigquery
267
+ database: my-gcp-project
268
+ schema: bizon_backend
269
+ syncCursorInDBEvery: 2
270
+ ```
271
+
272
+ #### First Run Behavior
273
+
274
+ On the first incremental run (no previous successful job):
275
+ - Falls back to `get()` method (full refresh behavior)
276
+ - All data is fetched and loaded
277
+ - Job is marked as successful
278
+ - Subsequent runs use `get_records_after()` with `last_run` timestamp
279
+
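Editor's note: the incremental state the Producer hands to `get_records_after()` is a plain Pydantic model, so it can also be built by hand, e.g. to exercise a source outside a full pipeline run. A small sketch; the timestamp and cursor field are illustrative:

```python
from datetime import datetime, timezone

from bizon.source.models import SourceIncrementalState

# Mirrors what the Producer builds: last_run comes from the previous successful job's
# created_at, cursor_field from the source config.
state = SourceIncrementalState(
    last_run=datetime(2024, 1, 1, tzinfo=timezone.utc),  # illustrative timestamp
    cursor_field="updated_at",
)

print(state.last_run.isoformat())  # the value a source compares its cursor field against
```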
156
280
  ## Start syncing your data 🚀
157
281
 
158
282
  ### Quick setup without any dependencies ✌️
@@ -13,21 +13,24 @@ bizon/common/models.py,sha256=eL_Ii0CkeJFIjak1CKrB74mbC3OkmWP2uI27ynlYgkQ,10070
13
13
  bizon/common/errors/backoff.py,sha256=z7RkQt1Npdh0sfD3hBDaiWQKe4iqS6ewvT1Q4Fds5aU,508
14
14
  bizon/common/errors/errors.py,sha256=mrYx1uE2kOuR2pEaB7ztK1l2m0E4V-_-hxq-DuILerY,682
15
15
  bizon/connectors/destinations/bigquery/config/bigquery.example.yml,sha256=sy5-Piew00BlcjX5CFayFVrUq9G_vFYWXDmpWi9beTY,1263
16
+ bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml,sha256=z0pz4W1x0dlsoAjorYR2DxMjkzTvIWn9tigqtOR8PUY,1076
16
17
  bizon/connectors/destinations/bigquery/src/config.py,sha256=q55zR_9V5-ZZmOmSK7fDOHSzzYhoT-fwlppDzX4he9U,4000
17
- bizon/connectors/destinations/bigquery/src/destination.py,sha256=wJHT9KO5aA8sLEqgVbb9aVCXZ51_5ccgkPtTCXuBp6s,9503
18
+ bizon/connectors/destinations/bigquery/src/destination.py,sha256=awS3dZsSKqLTVnhBKuP_9rXSt3IpGv3c4WjZOCwqu9o,9888
18
19
  bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml,sha256=rF0mQ5IaOe6oqsbVy6q0innn7SXsOoBdBvIN8BTwPVc,1869
19
20
  bizon/connectors/destinations/bigquery_streaming/src/config.py,sha256=LdBKEqHPaGll8PW6c6q_lH7PJvsGdtv2BCrtB-TukTA,1898
20
- bizon/connectors/destinations/bigquery_streaming/src/destination.py,sha256=6PLO0zMbPskwtaeKfnOvu5Ls0Z-gl11uXHMCPIEdHmc,16043
21
+ bizon/connectors/destinations/bigquery_streaming/src/destination.py,sha256=Uyne57NoT-z9uk7Yi4EgOUFYQ4QlvXDLFxgZC5KyCFE,14222
21
22
  bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml,sha256=hIQXlXtiBT8DgMVAs0x_h-19xoLkjHr-Ko7oSn8jnc0,2023
22
23
  bizon/connectors/destinations/bigquery_streaming_v2/src/config.py,sha256=cdHST5Vx1VQbLsIVsPkoEtOJKmbA35XjsKzj6fZ5DHw,1907
23
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py,sha256=GSByVunYPXqaVbPbQGDKJX3b4ngUenHbvdJKIlb95a8,18680
24
+ bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py,sha256=5aXEsbzyWKzS2F1pFMZ8pdbJaXmdGTaIrwgl2cd1IbU,19026
24
25
  bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py,sha256=aWYVzMPMTgsdDapYniu8h6Tf2Pty4fDisT_33d9yEJ4,3692
25
26
  bizon/connectors/destinations/file/config/file.example.yml,sha256=sMeX92hTrTQUrLmQgQFsq5OdG5Dk3BbpDo0NhRbBahI,986
27
+ bizon/connectors/destinations/file/config/file_incremental.example.yml,sha256=Xh5KwWiQRuq_MnMgOCHiHqIwHjOjXbwQlVlVcKdXARA,620
26
28
  bizon/connectors/destinations/file/src/config.py,sha256=dU64aFe7J63aBGh6Os8mXl2kvECj3s4pPC7H3EmOvb8,585
27
- bizon/connectors/destinations/file/src/destination.py,sha256=iVmFjLjjuGRD6jbivOUqSlYxtIIMz9buB5fvXpYMsYA,1827
29
+ bizon/connectors/destinations/file/src/destination.py,sha256=RQEL0Z5l409S319fAJyvW8cDblUCVAxPhALJVhjQKDM,4253
28
30
  bizon/connectors/destinations/logger/config/logger.example.yml,sha256=KtQRmqqFeziJtBZ7vzrXGQLdTgWZNjxx2sdFXpIgIp4,672
31
+ bizon/connectors/destinations/logger/config/logger_incremental.example.yml,sha256=rwTLlXib-Jo3b4-_NcFv2ShdPC73WEpiiX3apP3sKg0,541
29
32
  bizon/connectors/destinations/logger/src/config.py,sha256=vIV_G0k9c8DPcDxU6CGvEOL2zAEvAmKZcx3RV0eRi7A,426
30
- bizon/connectors/destinations/logger/src/destination.py,sha256=-KosqybNiJq3-mlrrxa0cSUdwmaDcelfeRQcytbfjBQ,1226
33
+ bizon/connectors/destinations/logger/src/destination.py,sha256=YUC_lAN5nrcrNAN90hnalKFAKX49KTDlJwdLfwTaC0U,2007
31
34
  bizon/connectors/sources/cycle/config/cycle.example.yml,sha256=UDiqOa-8ZsykmNT625kxq9tyXOj_gKe9CFwg9r_8SYk,230
32
35
  bizon/connectors/sources/cycle/src/source.py,sha256=6sXMneq59XZAT5oJseM9k6sGJaoQw4NDp8FTtg8lPhk,4213
33
36
  bizon/connectors/sources/cycle/tests/cycle_customers.py,sha256=A48S20LxIC0A74JLoFn4NTHNTgBWV_5stTFtF1Gfk2c,271
@@ -43,9 +46,11 @@ bizon/connectors/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py,sha25
43
46
  bizon/connectors/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py,sha256=PFUhDuFw1Q1AMNMsnXPQxoqHIWf_wHEL1hLQodYlLcQ,596
44
47
  bizon/connectors/sources/gsheets/config/default_auth.example.yml,sha256=KOBp6MfO4uJwpwEYW0tJ4X5ctVwwdur9poJB4Ohba6s,348
45
48
  bizon/connectors/sources/gsheets/config/service_account.example.yml,sha256=XxVUnk9gGWc3lDb8CnzTHjTu8xz4Asyr5tXzY6qLvPg,1081
49
+ bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml,sha256=WGvAtw4aOwSMWrSZW0tHaRncZnGbI6gd4LJk1aHIP_c,1765
46
50
  bizon/connectors/sources/gsheets/src/source.py,sha256=xNF5FR9QLTM4kCiZ2eKZ5CZWNhLw6tyLaJZbliNzYnY,5675
47
51
  bizon/connectors/sources/gsheets/tests/gsheets_pipeline.py,sha256=lNSM3kZTd4W_-ajGIO3mdp8qGdEbnmWqsMm5pRiS0cw,181
48
52
  bizon/connectors/sources/hubspot/config/api_key.example.yml,sha256=VDTRloE5caqAdGdXgvsJZ6nQT46JHzX_YboxeGbpP18,389
53
+ bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml,sha256=g4SBeVEXSr3tCgy5VjgZPWkhnuvEZ0jl5nPNn3u05Jc,920
49
54
  bizon/connectors/sources/hubspot/config/oauth.example.yml,sha256=YqBtj1IxIsdM9E85_4eVWl6mPiHsQNoQn41EzCqORy0,499
50
55
  bizon/connectors/sources/hubspot/src/hubspot_base.py,sha256=THo8ImrPrIxeTuFcBMRJYwaDMstIfLIGjrQLE2cqqsU,3424
51
56
  bizon/connectors/sources/hubspot/src/hubspot_objects.py,sha256=ykqvxaFihv0e0A3-gGDmentp1KCGCoYvvDwZ3CcHzNg,6301
@@ -60,9 +65,10 @@ bizon/connectors/sources/kafka/src/decode.py,sha256=RhPjazRQHb72D9iBhb763Nje7SH9
60
65
  bizon/connectors/sources/kafka/src/source.py,sha256=0Hv6viyVZGAd4azhQnqCteyHuwsbbDL4rSGEjMCff9E,19722
61
66
  bizon/connectors/sources/kafka/tests/kafka_pipeline.py,sha256=9LaCqXJIEx2ye3dkWq0YK_bPX7d4fCX_OcDOJCk34WE,206
62
67
  bizon/connectors/sources/notion/config/api_key.example.yml,sha256=TagqOqaho4u_G5ZP4L8je89Y4G_NvCo8s4Wf9e8yVH8,1061
68
+ bizon/connectors/sources/notion/config/api_key_incremental.example.yml,sha256=52uQJo-SrqFny00zIVbA86qVq3asYHMFALqBcdmPmc8,1499
63
69
  bizon/connectors/sources/notion/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
70
  bizon/connectors/sources/notion/src/config.py,sha256=L-FZWijUa-aWK9VenWGsl6mv40i4ww46FacjYoX9gXo,1886
65
- bizon/connectors/sources/notion/src/source.py,sha256=QG0z6uCRpIKa-BI7NfLUubb_p7_-z6WSWr8gpjWNHfY,50548
71
+ bizon/connectors/sources/notion/src/source.py,sha256=aViwfLuBzsNGZHwU4-z-xI40cROJTvx7Tlkw3ApF3q8,66217
66
72
  bizon/connectors/sources/notion/tests/notion_pipeline.py,sha256=lyiD9b5uUF3oih8vY4gk7QXnfySGSawnbrBuSdTLym8,200
67
73
  bizon/connectors/sources/notion/tests/test_notion.py,sha256=-G0DbTLDS2Gc_Bx8xR2VXnY89vW64s1-puwPc9x2N7A,4029
68
74
  bizon/connectors/sources/periscope/config/periscope_charts.example.yml,sha256=9OgFDB7vguiNz2F2fmRqDNV8S_ddO9ncN5hgW9MhME4,350
@@ -88,7 +94,7 @@ bizon/engine/backend/adapters/sqlalchemy/backend.py,sha256=ipJ7eY_iiqjrvtq4NS39C
88
94
  bizon/engine/backend/adapters/sqlalchemy/config.py,sha256=CeTWncVK27Y6lEKMVCF5RxD8Illhx2IQqqFkGrf0WKA,1845
89
95
  bizon/engine/pipeline/consumer.py,sha256=DtCR3mG791h35poYJdXjL9geNO-GWPKl_YC0zPsF5qI,3207
90
96
  bizon/engine/pipeline/models.py,sha256=qOra2MJGN6-PuouKpKuZRjutnQmzom0mgWDFZ16LcM8,405
91
- bizon/engine/pipeline/producer.py,sha256=8e7cKcZh3_Irz4ceb3NzIDD8X915U26eGONqgNiYpKQ,10017
97
+ bizon/engine/pipeline/producer.py,sha256=XV2fR6CNMRlbYwqTl9mlqy6nkG37ODyh2aiiTZ371VM,11995
92
98
  bizon/engine/queue/config.py,sha256=0XwiQSB2OKTs-rODCSZqT5txNZzGOic2-PvODbcSrGg,1267
93
99
  bizon/engine/queue/queue.py,sha256=Y9uj31d-ZgW2f0F02iccp_o-m-RoMm_jR61NkLdMQ2M,3461
94
100
  bizon/engine/queue/adapters/kafka/config.py,sha256=ndNEXRT-nIgyWgoqlNXFhmlN206v87GobXIW9Z0zrSA,1085
@@ -113,10 +119,10 @@ bizon/monitoring/datadog/monitor.py,sha256=YSdyMVEIjkDyp91_mGED_kx8j76MbQyQGkGJC
113
119
  bizon/monitoring/noop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
120
  bizon/monitoring/noop/monitor.py,sha256=Pu7Qt9SpUG1UvC8aWysgtoDY-t5tnKd4FlUXAC4MjbI,1066
115
121
  bizon/source/callback.py,sha256=lfTwU_bzJwR0q5sbiKoK8uedQ-dhfHzoYkPVqm8b_Ho,602
116
- bizon/source/config.py,sha256=0wQiX8VJJq9w0hALJkqIxMH4Wa760LhsieCy2VmUcfo,2223
122
+ bizon/source/config.py,sha256=JyZbKjlU0xhiyuuIGJYJPGUl9JxS4xyGeCyHoHgHHos,2473
117
123
  bizon/source/cursor.py,sha256=Wjh9eNEiHV5P9YnjS5bdS2ahyFc0gPm9QLQtD-QjQCI,4089
118
124
  bizon/source/discover.py,sha256=h9IVqtAQsTH-XxR-UkAFgNvEphLP2LgataQCCuHbGrk,11174
119
- bizon/source/models.py,sha256=cncnIgZF-kQM0e08trvNgOHQ6AnQK8ko3GzgOGuWwgE,1705
125
+ bizon/source/models.py,sha256=CHPKvO9chRi85WPDfLYy9vWnPsua8LTwYvjjN7Dj2uA,1837
120
126
  bizon/source/session.py,sha256=klbCv0g6sm6ac-pzM50eAJSP8DdQ9DOegHgjpmKKUrI,1978
121
127
  bizon/source/source.py,sha256=k_fHOOvam5ixZ9oPuQzUa9Kq3jVvv2HY7ghrCo-0o3I,4342
122
128
  bizon/source/auth/builder.py,sha256=hc4zBNj31LZc-QqgIyx1VQEYTm9Xv81vY5pJiwQroJo,860
@@ -129,8 +135,8 @@ bizon/source/auth/authenticators/oauth.py,sha256=tY_UZsWTy4FkifqJ7-smPaD61gg1dMJ
129
135
  bizon/source/auth/authenticators/token.py,sha256=P6SKRAarAEv28YiWp8hQLSKAV7twNlyNTGRr9sxlx58,956
130
136
  bizon/transform/config.py,sha256=Q9F7jlsuaXK8OYrO5qcdk8lxXTDoIgzoVMhhHW3igEw,213
131
137
  bizon/transform/transform.py,sha256=Ufla8YFx9C9WEiN0ppmZS1a86Sk0PgggqC-8DIvDeAQ,1414
132
- bizon-0.2.0.dist-info/METADATA,sha256=ArxHLLJlMkVTo6mYoaeiLco_inGdcjPivIB9gOki-QA,6322
133
- bizon-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
134
- bizon-0.2.0.dist-info/entry_points.txt,sha256=hHZPN-V6JwwhSYWNCKVu3WNxekuhXtIAaz_zdwO7NDo,45
135
- bizon-0.2.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
136
- bizon-0.2.0.dist-info/RECORD,,
138
+ bizon-0.3.0.dist-info/METADATA,sha256=oX7OZjHhKAVvQ8UiRS0ksqu3C65t2kOp2mAfXoEBdJY,11159
139
+ bizon-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
140
+ bizon-0.3.0.dist-info/entry_points.txt,sha256=hHZPN-V6JwwhSYWNCKVu3WNxekuhXtIAaz_zdwO7NDo,45
141
+ bizon-0.3.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
142
+ bizon-0.3.0.dist-info/RECORD,,