bizon 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bizon/cli/main.py CHANGED
@@ -1,4 +1,5 @@
  import click
+ from dotenv import find_dotenv, load_dotenv

  from bizon.engine.engine import RunnerFactory
  from bizon.engine.runner.config import LoggerLevel
@@ -95,15 +96,28 @@ def destination():
      show_default=True,
      help="Log level to use.",
  )
+ @click.option(
+     "--env-file",
+     required=False,
+     type=click.Path(exists=True),
+     help="Path to .env file to load environment variables from.",
+ )
  def run(
      filename: str,
      custom_source: str,
      runner: str,
      log_level: LoggerLevel,
+     env_file: str,
      help="Run a bizon pipeline from a YAML file.",
  ):
      """Run a bizon pipeline from a YAML file."""

+     # Load environment variables from .env file
+     if env_file:
+         load_dotenv(env_file)
+     else:
+         load_dotenv(find_dotenv(".env"))
+
      # Parse config from YAML file as a dictionary
      config = parse_from_yaml(filename)

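
For illustration only, the precedence the new --env-file option introduces boils down to a few lines of python-dotenv. This is a minimal sketch, not the CLI code itself; the helper name load_env is invented for the example.

from typing import Optional

from dotenv import find_dotenv, load_dotenv


def load_env(env_file: Optional[str] = None) -> None:
    # Hypothetical helper mirroring the behaviour added to `bizon run` above
    if env_file:
        load_dotenv(env_file)  # explicit path passed on the command line wins
    else:
        load_dotenv(find_dotenv(".env"))  # otherwise auto-discover the nearest .env
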
@@ -0,0 +1,34 @@
+ name: hubspot contacts to bigquery (incremental)
+
+ source:
+   name: hubspot
+   stream: contacts
+   sync_mode: incremental
+   cursor_field: updatedAt # HubSpot's timestamp field for filtering
+   properties:
+     strategy: all
+   authentication:
+     type: api_key
+     api_key: <MY_API_KEY>
+
+ destination:
+   # Authentication: If empty it will be inferred.
+   # Must have the bigquery.jobUser
+   # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
+   name: bigquery
+   config:
+     buffer_size: 10 # in Mb
+     buffer_flush_timeout: 300 # in seconds
+     dataset_id: bizon_test
+     dataset_location: US
+     project_id: my-gcp-project-id
+     gcs_buffer_bucket: bizon-buffer
+     gcs_buffer_format: parquet
+     # Optional: service_account_key for explicit authentication
+     # service_account_key: >-
+     #   { ... }
+
+ # How incremental sync works:
+ # 1. First run: Behaves like full_refresh (fetches all data)
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
+ # 3. Uses append-only strategy - new records are appended to existing data
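
As a rough illustration of the cursor behaviour described in the comments above (not the connector's actual code; the record shapes and last_run value are invented for the example):

from datetime import datetime, timezone

# last_run is the cursor persisted by the engine backend between runs (value invented)
last_run = datetime(2024, 1, 1, tzinfo=timezone.utc)

records = [
    {"id": 1, "updatedAt": datetime(2023, 12, 31, tzinfo=timezone.utc)},
    {"id": 2, "updatedAt": datetime(2024, 1, 2, tzinfo=timezone.utc)},
]

# An incremental run keeps only records where cursor_field > last_run
to_sync = [r for r in records if r["updatedAt"] > last_run]
print(to_sync)  # only id 2 survives; id 1 was already synced on a previous run
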
@@ -210,7 +210,11 @@ class BigQueryDestination(AbstractDestination):
              return True

          elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
-             # TO DO: Implement incremental sync
+             # Append data from incremental temp table to main table
+             logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+             self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+             logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
              return True

          elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
@@ -1,6 +1,5 @@
  import os
  import tempfile
- from datetime import datetime
  from typing import List, Tuple

  import orjson
@@ -162,39 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
          response = write_client.append_rows(iter([request]))
          return response.code().name

-     def safe_cast_record_values(self, row: dict):
-         """
-         Safe cast record values to the correct type for BigQuery.
-         """
-         for col in self.record_schemas[self.destination_id]:
-             # Handle dicts as strings
-             if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
-                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                     row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-             # Handle timestamps
-             if (
-                 col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
-                 and col.default_value_expression is None
-             ):
-                 if isinstance(row[col.name], int):
-                     if row[col.name] > datetime(9999, 12, 31).timestamp():
-                         row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                             "%Y-%m-%d %H:%M:%S.%f"
-                         )
-                     else:
-                         try:
-                             row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                         except ValueError:
-                             error_message = (
-                                 f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                 f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                 "Consider using a transformation."
-                             )
-                             logger.error(error_message)
-                             raise ValueError(error_message)
-         return row
-
      @retry(
          retry=retry_if_exception_type(
              (
@@ -281,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):

          if self.config.unnest:
              # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
-             rows_to_insert = [
-                 self.safe_cast_record_values(orjson.loads(row))
-                 for row in df_destination_records["source_data"].to_list()
-             ]
+             rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
          else:
              df_destination_records = df_destination_records.with_columns(
                  pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
@@ -1,7 +1,6 @@
  import os
  import tempfile
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from datetime import datetime
  from typing import List, Tuple, Type

  import orjson
@@ -40,6 +39,7 @@ from bizon.destination.destination import AbstractDestination
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.monitoring.monitor import AbstractMonitor
  from bizon.source.callback import AbstractSourceCallback
+ from bizon.source.config import SourceSyncModes

  from .config import BigQueryStreamingV2ConfigDetails
  from .proto_utils import get_proto_schema_and_class
@@ -81,6 +81,17 @@ class BigQueryStreamingV2Destination(AbstractDestination):
          tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
          return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

+     @property
+     def temp_table_id(self) -> str:
+         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+             return f"{self.table_id}_temp"
+         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+             return f"{self.table_id}_incremental"
+         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+             return f"{self.table_id}"
+         # Default fallback
+         return f"{self.table_id}"
+
      def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
          if self.config.unnest:
              if len(list(self.record_schemas.keys())) == 1:
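
To make the staging scheme concrete, the property above resolves to table IDs along the following lines. This is an illustrative mapping only; the fully-qualified table name is hypothetical and the mode strings mirror the sync modes used in the configs.

# Sketch of sync mode -> staging table used by the BigQuery streaming destination
table_id = "my-project.my_dataset.hubspot_contacts"  # hypothetical main table

staging_by_mode = {
    "full_refresh": f"{table_id}_temp",        # replaced into the main table on finalize
    "incremental": f"{table_id}_incremental",  # appended to the main table on finalize
    "stream": table_id,                        # written directly, no staging table
}
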
@@ -165,36 +176,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
          logger.error(f"Stream name: {stream_name}")
          raise

-     def safe_cast_record_values(self, row: dict):
-         """
-         Safe cast record values to the correct type for BigQuery.
-         """
-         for col in self.record_schemas[self.destination_id]:
-             # Handle dicts as strings
-             if col.type in ["STRING", "JSON"]:
-                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                     row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-             # Handle timestamps
-             if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
-                 if isinstance(row[col.name], int):
-                     if row[col.name] > datetime(9999, 12, 31).timestamp():
-                         row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                             "%Y-%m-%d %H:%M:%S.%f"
-                         )
-                     else:
-                         try:
-                             row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                         except ValueError:
-                             error_message = (
-                                 f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                 f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                 "Consider using a transformation."
-                             )
-                             logger.error(error_message)
-                             raise ValueError(error_message)
-         return row
-
      @staticmethod
      def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
          """Convert a row to a Protobuf serialization."""
@@ -263,14 +244,14 @@ class BigQueryStreamingV2Destination(AbstractDestination):
              deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
              deserialized_rows.append(deserialized_row)

-         # For large rows, we need to use the main client
+         # For large rows, we need to use the main client (write to temp_table_id)
          job_config = bigquery.LoadJobConfig(
              source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
-             schema=self.bq_client.get_table(self.table_id).schema,
+             schema=self.bq_client.get_table(self.temp_table_id).schema,
              ignore_unknown_values=True,
          )
          load_job = self.bq_client.load_table_from_json(
-             deserialized_rows, self.table_id, job_config=job_config, timeout=300
+             deserialized_rows, self.temp_table_id, job_config=job_config, timeout=300
          )
          result = load_job.result()
          if load_job.state != "DONE":
@@ -292,9 +273,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
              raise

      def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-         # Create table if it does not exist
+         # Create table if it does not exist (use temp_table_id for staging)
          schema = self.get_bigquery_schema()
-         table = bigquery.Table(self.table_id, schema=schema)
+         table = bigquery.Table(self.temp_table_id, schema=schema)
          time_partitioning = TimePartitioning(
              field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
          )
@@ -305,7 +286,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
          try:
              table = self.bq_client.create_table(table)
          except Conflict:
-             table = self.bq_client.get_table(self.table_id)
+             table = self.bq_client.get_table(self.temp_table_id)
              # Compare and update schema if needed
              existing_fields = {field.name: field for field in table.schema}
              new_fields = {field.name: field for field in self.get_bigquery_schema()}
@@ -319,12 +300,13 @@ class BigQueryStreamingV2Destination(AbstractDestination):
              table.schema = updated_schema
              table = self.bq_client.update_table(table, ["schema"])

-         # Create the stream
-         if self.destination_id:
-             project, dataset, table_name = self.destination_id.split(".")
+         # Create the stream (use temp_table_id for staging)
+         temp_table_parts = self.temp_table_id.split(".")
+         if len(temp_table_parts) == 3:
+             project, dataset, table_name = temp_table_parts
              parent = BigQueryWriteClient.table_path(project, dataset, table_name)
          else:
-             parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, self.destination_id)
+             parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, temp_table_parts[-1])

          stream_name = f"{parent}/_default"

@@ -333,9 +315,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):

          if self.config.unnest:
              serialized_rows = [
-                 self.to_protobuf_serialization(
-                     TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
-                 )
+                 self.to_protobuf_serialization(TableRowClass=TableRow, row=orjson.loads(row))
                  for row in df_destination_records["source_data"].to_list()
              ]
          else:
@@ -442,3 +422,29 @@ class BigQueryStreamingV2Destination(AbstractDestination):
          if large_rows:
              logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
              yield {"stream_batch": current_batch, "json_batch": large_rows}
+
+     def finalize(self):
+         """Finalize the sync by moving data from temp table to main table based on sync mode."""
+         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+             # Replace main table with temp table data
+             logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
+             self.bq_client.query(
+                 f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+             ).result()
+             logger.info(f"Deleting temp table {self.temp_table_id} ...")
+             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+             return True
+
+         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+             # Append data from incremental temp table to main table
+             logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+             self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+             logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+             return True
+
+         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+             # Direct writes, no finalization needed
+             return True
+
+         return True
@@ -0,0 +1,22 @@
+ name: dummy to file (incremental)
+
+ source:
+   name: dummy
+   stream: creatures
+   sync_mode: incremental
+   cursor_field: updated_at # Field to filter records by timestamp
+   authentication:
+     type: api_key
+     params:
+       token: dummy_key
+
+ destination:
+   name: file
+   config:
+     format: json
+
+ # How incremental sync works with file destination:
+ # 1. First run: Behaves like full_refresh (creates new file)
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
+ # 3. New records are appended to the existing JSON file
+ # 4. Writes to temp file (_incremental.json) then appends to main file on finalize
@@ -1,13 +1,17 @@
+ import os
+ import shutil
  from typing import Tuple

  import orjson
  import polars as pl
+ from loguru import logger

  from bizon.common.models import SyncMetadata
  from bizon.destination.destination import AbstractDestination
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.monitoring.monitor import AbstractMonitor
  from bizon.source.callback import AbstractSourceCallback
+ from bizon.source.config import SourceSyncModes

  from .config import FileDestinationDetailsConfig

@@ -24,6 +28,30 @@ class FileDestination(AbstractDestination):
          super().__init__(sync_metadata, config, backend, source_callback, monitor)
          self.config: FileDestinationDetailsConfig = config

+     @property
+     def file_path(self) -> str:
+         """Main output file path."""
+         return f"{self.destination_id}.json"
+
+     @property
+     def temp_file_path(self) -> str:
+         """Temp file path for FULL_REFRESH mode."""
+         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+             return f"{self.destination_id}_temp.json"
+         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+             return f"{self.destination_id}_incremental.json"
+         return self.file_path
+
+     @property
+     def write_path(self) -> str:
+         """Get the path to write to based on sync mode."""
+         if self.sync_metadata.sync_mode in [
+             SourceSyncModes.FULL_REFRESH.value,
+             SourceSyncModes.INCREMENTAL.value,
+         ]:
+             return self.temp_file_path
+         return self.file_path
+
      def check_connection(self) -> bool:
          return True

@@ -34,7 +62,7 @@ class FileDestination(AbstractDestination):

          if self.config.unnest:
              schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])
-             with open(f"{self.destination_id}.json", "a") as f:
+             with open(self.write_path, "a") as f:
                  for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
                      assert set(value.keys()) == schema_keys, "Keys do not match the schema"

@@ -46,6 +74,35 @@ class FileDestination(AbstractDestination):
                      f.write(f"{orjson.dumps(row).decode('utf-8')}\n")

          else:
-             df_destination_records.write_ndjson(f"{self.destination_id}.json")
+             # Append mode for incremental, overwrite for full refresh on first write
+             with open(self.write_path, "a") as f:
+                 for record in df_destination_records.iter_rows(named=True):
+                     f.write(f"{orjson.dumps(record).decode('utf-8')}\n")

          return True, ""
+
+     def finalize(self) -> bool:
+         """Finalize the sync by moving temp file to main file based on sync mode."""
+         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+             # Replace main file with temp file
+             if os.path.exists(self.temp_file_path):
+                 logger.info(f"File destination: Moving {self.temp_file_path} to {self.file_path}")
+                 shutil.move(self.temp_file_path, self.file_path)
+             return True
+
+         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+             # Append temp file contents to main file
+             if os.path.exists(self.temp_file_path):
+                 logger.info(f"File destination: Appending {self.temp_file_path} to {self.file_path}")
+                 with open(self.file_path, "a") as main_file:
+                     with open(self.temp_file_path) as temp_file:
+                         main_file.write(temp_file.read())
+                 os.remove(self.temp_file_path)
+             return True
+
+         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+             # Direct writes, no finalization needed
+             logger.info("File destination: STREAM sync batch completed")
+             return True
+
+         return True
@@ -0,0 +1,21 @@
+ name: dummy to logger (incremental)
+
+ source:
+   name: dummy
+   stream: creatures
+   sync_mode: incremental
+   cursor_field: updated_at # Field to filter records by timestamp
+   authentication:
+     type: api_key
+     params:
+       token: dummy_key
+
+ destination:
+   name: logger
+   config:
+     dummy: dummy
+
+ # How incremental sync works:
+ # 1. First run: Behaves like full_refresh (fetches all data)
+ # 2. Subsequent runs: Only fetches records where cursor_field > last_run
+ # 3. Logger outputs records with [incremental] prefix for easy identification
@@ -8,6 +8,7 @@ from bizon.destination.destination import AbstractDestination
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.monitoring.monitor import AbstractMonitor
  from bizon.source.callback import AbstractSourceCallback
+ from bizon.source.config import SourceSyncModes

  from .config import LoggerDestinationConfig

@@ -36,6 +37,17 @@ class LoggerDestination(AbstractDestination):
          return True

      def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+         sync_mode_label = f"[{self.sync_metadata.sync_mode}]" if self.sync_metadata.sync_mode else ""
          for record in df_destination_records.iter_rows(named=True):
-             logger.info(record["source_data"])
+             logger.info(f"{sync_mode_label} {record['source_data']}")
          return True, ""
+
+     def finalize(self) -> bool:
+         """Finalize the sync - logs completion message based on sync mode."""
+         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+             logger.info("Logger destination: FULL_REFRESH sync completed")
+         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+             logger.info("Logger destination: INCREMENTAL sync completed (records appended)")
+         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+             logger.info("Logger destination: STREAM sync batch completed")
+         return True
@@ -0,0 +1,51 @@
+ name: gsheets incremental sync
+
+ source:
+   name: gsheets
+   stream: worksheet
+   sync_mode: incremental
+   cursor_field: updated_at # Column name in your sheet containing timestamps
+   spreadsheet_url: <MY_SPREADSHEET_URL>
+   worksheet_name: Sheet1
+   service_account_key: >-
+     {
+       "type": "service_account",
+       "project_id": "<MY_GCP_PROJECT>",
+       "private_key_id": "xxx",
+       "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
+       "client_email": "bizon@<MY_GCP_PROJECT>.iam.gserviceaccount.com",
+       "client_id": "999999999999",
+       "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+       "token_uri": "https://oauth2.googleapis.com/token",
+       "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+       "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...",
+       "universe_domain": "googleapis.com"
+     }
+
+ destination:
+   name: bigquery
+   config:
+     project_id: <MY_GCP_PROJECT>
+     dataset_id: gsheets_data
+     dataset_location: US
+     gcs_buffer_bucket: <MY_GCS_BUCKET>
+     gcs_buffer_format: parquet
+
+ engine:
+   backend:
+     type: bigquery
+     database: <MY_GCP_PROJECT>
+     schema: bizon_backend
+     syncCursorInDBEvery: 2
+
+ # Incremental sync for Google Sheets:
+ # - First run: Fetches all rows (full refresh behavior)
+ # - Subsequent runs: Only fetches rows where cursor_field > last_run
+ #
+ # IMPORTANT: Your Google Sheet must have a timestamp column for incremental sync.
+ # Common patterns:
+ # - Add an "updated_at" column with formula: =NOW() (updates on edit)
+ # - Use Google Apps Script to auto-update timestamps on row changes
+ # - Manually maintain a "last_modified" column
+ #
+ # If your sheet doesn't have timestamps, use sync_mode: full_refresh instead.
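
The requirement above amounts to the cursor column being parseable as a timestamp so rows can be compared against the stored cursor. A minimal sketch of that comparison, with invented rows and a text-formatted updated_at column (not the connector's actual parsing logic):

from datetime import datetime

rows = [
    {"name": "alice", "updated_at": "2024-01-02 09:30:00"},
    {"name": "bob", "updated_at": "2023-12-30 17:00:00"},
]
last_run = datetime(2024, 1, 1)  # cursor persisted from the previous run (invented)

# Only rows edited after last_run would be fetched on an incremental run
changed_rows = [
    row for row in rows
    if datetime.strptime(row["updated_at"], "%Y-%m-%d %H:%M:%S") > last_run
]
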
@@ -0,0 +1,40 @@
+ name: hubspot contacts incremental sync
+
+ source:
+   name: hubspot
+   stream: contacts
+   sync_mode: incremental
+   cursor_field: updatedAt # HubSpot's timestamp field for contacts
+   properties:
+     strategy: all
+   authentication:
+     type: api_key
+     params:
+       token: <MY_API_KEY>
+
+ destination:
+   name: bigquery
+   config:
+     project_id: <MY_GCP_PROJECT>
+     dataset_id: hubspot_data
+     dataset_location: US
+     gcs_buffer_bucket: <MY_GCS_BUCKET>
+     gcs_buffer_format: parquet
+
+ engine:
+   backend:
+     type: bigquery
+     database: <MY_GCP_PROJECT>
+     schema: bizon_backend
+     syncCursorInDBEvery: 2
+
+ # Incremental sync for HubSpot:
+ # - First run: Fetches all contacts (full refresh behavior)
+ # - Subsequent runs: Only fetches contacts where updatedAt > last_run
+ #
+ # Common cursor fields by stream:
+ # - contacts: updatedAt
+ # - companies: updatedAt
+ # - deals: updatedAt
+ # - tickets: updatedAt
+ # - products: updatedAt
@@ -0,0 +1,48 @@
+ name: notion pages incremental sync
+
+ source:
+   name: notion
+   stream: pages # Options: databases, data_sources, pages, blocks, users
+   sync_mode: incremental
+   cursor_field: last_edited_time # Notion's timestamp field
+   authentication:
+     type: api_key
+     params:
+       token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+   # List of database IDs to fetch data from
+   database_ids:
+     - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+   # Number of results per API call (1-100, default: 100)
+   page_size: 100
+
+ destination:
+   name: bigquery
+   config:
+     project_id: <MY_GCP_PROJECT>
+     dataset_id: notion_data
+     dataset_location: US
+     gcs_buffer_bucket: <MY_GCS_BUCKET>
+     gcs_buffer_format: parquet
+
+ engine:
+   backend:
+     type: bigquery
+     database: <MY_GCP_PROJECT>
+     schema: bizon_backend
+     syncCursorInDBEvery: 2
+
+ # Incremental sync for Notion:
+ # - First run: Fetches all pages/databases (full refresh behavior)
+ # - Subsequent runs: Only fetches items where last_edited_time > last_run
+ #
+ # Supported streams for incremental sync:
+ # - pages, all_pages: Uses Search API with last_edited_time filter
+ # - databases, all_databases: Uses Search API to find updated data_sources
+ # - blocks: First finds updated pages, then fetches their blocks
+ # - blocks_markdown, all_blocks_markdown: Same as blocks, converts to markdown
+ #
+ # Not supported (falls back to full refresh):
+ # - users: No timestamp filter available
+ # - data_sources: Use databases stream instead
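
A rough sketch of the kind of request the comments above describe for the pages stream, assuming results from the Notion Search API are sorted by last_edited_time and then cut off at the stored cursor. The endpoint, headers, and sort/filter body are standard Notion API; the cutoff logic is illustrative, not the connector's code.

import os
from datetime import datetime, timezone

import requests

NOTION_TOKEN = os.environ["NOTION_TOKEN"]  # integration token, e.g. loaded via --env-file
last_run = datetime(2024, 1, 1, tzinfo=timezone.utc)  # stored cursor (value invented)

response = requests.post(
    "https://api.notion.com/v1/search",
    headers={
        "Authorization": f"Bearer {NOTION_TOKEN}",
        "Notion-Version": "2022-06-28",
        "Content-Type": "application/json",
    },
    json={
        "filter": {"property": "object", "value": "page"},
        "sort": {"direction": "descending", "timestamp": "last_edited_time"},
        "page_size": 100,
    },
    timeout=30,
)
response.raise_for_status()

# Keep only pages edited since the last run (cursor_field > last_run)
updated_pages = [
    page
    for page in response.json()["results"]
    if datetime.fromisoformat(page["last_edited_time"].replace("Z", "+00:00")) > last_run
]
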