bizon 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml +34 -0
- bizon/connectors/destinations/bigquery/src/destination.py +5 -1
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +1 -38
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +50 -44
- bizon/connectors/destinations/file/config/file_incremental.example.yml +22 -0
- bizon/connectors/destinations/file/src/destination.py +59 -2
- bizon/connectors/destinations/logger/config/logger_incremental.example.yml +21 -0
- bizon/connectors/destinations/logger/src/destination.py +13 -1
- bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml +51 -0
- bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml +40 -0
- bizon/connectors/sources/notion/config/api_key_incremental.example.yml +48 -0
- bizon/connectors/sources/notion/src/source.py +343 -1
- bizon/engine/pipeline/producer.py +42 -1
- bizon/source/config.py +6 -0
- bizon/source/models.py +2 -1
- {bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/METADATA +125 -1
- {bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/RECORD +20 -14
- {bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/WHEEL +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/entry_points.txt +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/licenses/LICENSE +0 -0
bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml
ADDED
@@ -0,0 +1,34 @@
+name: hubspot contacts to bigquery (incremental)
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for filtering
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    api_key: <MY_API_KEY>
+
+destination:
+  # Authentication: If empty it will be infered.
+  # Must have the bigquery.jobUser
+  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
+  name: bigquery
+  config:
+    buffer_size: 10 # in Mb
+    buffer_flush_timeout: 300 # in seconds
+    dataset_id: bizon_test
+    dataset_location: US
+    project_id: my-gcp-project-id
+    gcs_buffer_bucket: bizon-buffer
+    gcs_buffer_format: parquet
+    # Optional: service_account_key for explicit authentication
+    # service_account_key: >-
+    #   { ... }
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Uses append-only strategy - new records are appended to existing data
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -210,7 +210,11 @@ class BigQueryDestination(AbstractDestination):
             return True
 
         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
-            #
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True
 
         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
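The incremental branch added above finalizes a run by appending the incremental temp table into the main table and then dropping the temp table. A minimal standalone sketch of the same append-only promotion with the google-cloud-bigquery client, assuming both tables already exist with identical schemas (the table names below are placeholders, not taken from the diff):

```python
from google.cloud import bigquery


def append_incremental(client: bigquery.Client, table_id: str, temp_table_id: str) -> None:
    # Append the staged incremental rows onto the main table (append-only strategy)
    client.query(f"INSERT INTO {table_id} SELECT * FROM {temp_table_id}").result()
    # Drop the staging table; not_found_ok avoids an error if it is already gone
    client.delete_table(temp_table_id, not_found_ok=True)


# Hypothetical usage with fully qualified table names:
# append_incremental(
#     bigquery.Client(),
#     "my-project.bizon_test.hubspot_contacts",
#     "my-project.bizon_test.hubspot_contacts_incremental",
# )
```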
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import tempfile
-from datetime import datetime
 from typing import List, Tuple
 
 import orjson
@@ -162,39 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         response = write_client.append_rows(iter([request]))
         return response.code().name
 
-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if (
-                col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
-                and col.default_value_expression is None
-            ):
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @retry(
         retry=retry_if_exception_type(
             (
@@ -281,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):
 
         if self.config.unnest:
             # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
-            rows_to_insert = [
-                self.safe_cast_record_values(orjson.loads(row))
-                for row in df_destination_records["source_data"].to_list()
-            ]
+            rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
         else:
             df_destination_records = df_destination_records.with_columns(
                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
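The streaming destination drops its `safe_cast_record_values` helper in this release (the streaming_v2 destination below does the same), so unnested rows are now handed to BigQuery exactly as `orjson.loads` decodes them. If upstream data still carries raw dicts or integer epochs, an equivalent cast could be applied before loading; a sketch reproducing the removed logic, with the column schema passed in explicitly rather than read from `self.record_schemas`:

```python
from datetime import datetime
from typing import List

import orjson


def cast_row(row: dict, columns: List[dict]) -> dict:
    """Serialize dict/list values to JSON strings and turn integer epochs into timestamp strings."""
    for col in columns:  # each col is assumed to look like {"name": ..., "type": ...}
        name, col_type = col["name"], col["type"]
        if col_type in ("STRING", "JSON") and isinstance(row.get(name), (dict, list)):
            row[name] = orjson.dumps(row[name]).decode("utf-8")
        if col_type in ("TIMESTAMP", "DATETIME") and isinstance(row.get(name), int):
            value = row[name]
            # Values past year 9999 are treated as microsecond epochs, as in the removed helper
            if value > datetime(9999, 12, 31).timestamp():
                value = value / 1_000_000
            row[name] = datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M:%S.%f")
    return row
```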
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
 from typing import List, Tuple, Type
 
 import orjson
@@ -40,6 +39,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes
 
 from .config import BigQueryStreamingV2ConfigDetails
 from .proto_utils import get_proto_schema_and_class
@@ -81,6 +81,17 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
 
+    @property
+    def temp_table_id(self) -> str:
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            return f"{self.table_id}_temp"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            return f"{self.table_id}_incremental"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            return f"{self.table_id}"
+        # Default fallback
+        return f"{self.table_id}"
+
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
@@ -165,36 +176,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             logger.error(f"Stream name: {stream_name}")
             raise
 
-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in ["STRING", "JSON"]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
@@ -263,14 +244,14 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
             deserialized_rows.append(deserialized_row)
 
-        # For large rows, we need to use the main client
+        # For large rows, we need to use the main client (write to temp_table_id)
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
-            schema=self.bq_client.get_table(self.table_id).schema,
+            schema=self.bq_client.get_table(self.temp_table_id).schema,
             ignore_unknown_values=True,
         )
         load_job = self.bq_client.load_table_from_json(
-            deserialized_rows, self.table_id, job_config=job_config, timeout=300
+            deserialized_rows, self.temp_table_id, job_config=job_config, timeout=300
         )
         result = load_job.result()
         if load_job.state != "DONE":
@@ -292,9 +273,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
            raise
 
    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-        # Create table if it does not exist
+        # Create table if it does not exist (use temp_table_id for staging)
         schema = self.get_bigquery_schema()
-        table = bigquery.Table(self.table_id, schema=schema)
+        table = bigquery.Table(self.temp_table_id, schema=schema)
         time_partitioning = TimePartitioning(
             field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
         )
@@ -305,7 +286,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         try:
             table = self.bq_client.create_table(table)
         except Conflict:
-            table = self.bq_client.get_table(self.table_id)
+            table = self.bq_client.get_table(self.temp_table_id)
             # Compare and update schema if needed
             existing_fields = {field.name: field for field in table.schema}
             new_fields = {field.name: field for field in self.get_bigquery_schema()}
@@ -319,12 +300,13 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             table.schema = updated_schema
             table = self.bq_client.update_table(table, ["schema"])
 
-        # Create the stream
-
-
+        # Create the stream (use temp_table_id for staging)
+        temp_table_parts = self.temp_table_id.split(".")
+        if len(temp_table_parts) == 3:
+            project, dataset, table_name = temp_table_parts
             parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id,
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, temp_table_parts[-1])
 
         stream_name = f"{parent}/_default"
 
@@ -333,9 +315,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
 
         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(
-                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
-                )
+                self.to_protobuf_serialization(TableRowClass=TableRow, row=orjson.loads(row))
                 for row in df_destination_records["source_data"].to_list()
             ]
         else:
@@ -442,3 +422,29 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         if large_rows:
             logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
+
+    def finalize(self):
+        """Finalize the sync by moving data from temp table to main table based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            # Replace main table with temp table data
+            logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
+            self.bq_client.query(
+                f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            ).result()
+            logger.info(f"Deleting temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            # Direct writes, no finalization needed
+            return True
+
+        return True
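Taken together, the new `temp_table_id` property and `finalize()` turn the streaming_v2 destination into a staging workflow: batches land in `<table>_temp` (full refresh) or `<table>_incremental` (incremental) and are promoted on finalize, while stream mode keeps writing to the main table directly. A condensed sketch of that mapping, assuming a fully qualified `table_id` and lower-case sync-mode strings:

```python
from typing import Optional


def staging_table_for(table_id: str, sync_mode: str) -> str:
    """Table a batch is written to for a given sync mode."""
    suffix = {"full_refresh": "_temp", "incremental": "_incremental"}.get(sync_mode, "")
    return f"{table_id}{suffix}"


def promotion_sql(table_id: str, sync_mode: str) -> Optional[str]:
    """SQL that the finalize step would run; stream mode needs no promotion step."""
    staging = staging_table_for(table_id, sync_mode)
    if sync_mode == "full_refresh":
        return f"CREATE OR REPLACE TABLE {table_id} AS SELECT * FROM {staging}"
    if sync_mode == "incremental":
        return f"INSERT INTO {table_id} SELECT * FROM {staging}"
    return None


# promotion_sql("proj.ds.contacts", "incremental")
# -> "INSERT INTO proj.ds.contacts SELECT * FROM proj.ds.contacts_incremental"
```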
bizon/connectors/destinations/file/config/file_incremental.example.yml
ADDED
@@ -0,0 +1,22 @@
+name: dummy to file (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: file
+  config:
+    format: json
+
+# How incremental sync works with file destination:
+# 1. First run: Behaves like full_refresh (creates new file)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. New records are appended to the existing JSON file
+# 4. Writes to temp file (_incremental.json) then appends to main file on finalize
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -1,13 +1,17 @@
+import os
+import shutil
 from typing import Tuple
 
 import orjson
 import polars as pl
+from loguru import logger
 
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes
 
 from .config import FileDestinationDetailsConfig
 
@@ -24,6 +28,30 @@ class FileDestination(AbstractDestination):
         super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config
 
+    @property
+    def file_path(self) -> str:
+        """Main output file path."""
+        return f"{self.destination_id}.json"
+
+    @property
+    def temp_file_path(self) -> str:
+        """Temp file path for FULL_REFRESH mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            return f"{self.destination_id}_temp.json"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            return f"{self.destination_id}_incremental.json"
+        return self.file_path
+
+    @property
+    def write_path(self) -> str:
+        """Get the path to write to based on sync mode."""
+        if self.sync_metadata.sync_mode in [
+            SourceSyncModes.FULL_REFRESH.value,
+            SourceSyncModes.INCREMENTAL.value,
+        ]:
+            return self.temp_file_path
+        return self.file_path
+
     def check_connection(self) -> bool:
         return True
 
@@ -34,7 +62,7 @@ class FileDestination(AbstractDestination):
         if self.config.unnest:
             schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])
 
-            with open(
+            with open(self.write_path, "a") as f:
                 for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
                     assert set(value.keys()) == schema_keys, "Keys do not match the schema"
 
@@ -46,6 +74,35 @@ class FileDestination(AbstractDestination):
                     f.write(f"{orjson.dumps(row).decode('utf-8')}\n")
 
         else:
-
+            # Append mode for incremental, overwrite for full refresh on first write
+            with open(self.write_path, "a") as f:
+                for record in df_destination_records.iter_rows(named=True):
+                    f.write(f"{orjson.dumps(record).decode('utf-8')}\n")
 
         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync by moving temp file to main file based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            # Replace main file with temp file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Moving {self.temp_file_path} to {self.file_path}")
+                shutil.move(self.temp_file_path, self.file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            # Append temp file contents to main file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Appending {self.temp_file_path} to {self.file_path}")
+                with open(self.file_path, "a") as main_file:
+                    with open(self.temp_file_path) as temp_file:
+                        main_file.write(temp_file.read())
+                os.remove(self.temp_file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            # Direct writes, no finalization needed
+            logger.info("File destination: STREAM sync batch completed")
+            return True
+
+        return True
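The file destination mirrors the BigQuery staging flow with plain files: during the run, NDJSON records are appended to `<destination_id>_incremental.json`, and `finalize()` promotes that temp file onto the main file. A small standard-library sketch of the same promotion step (paths are placeholders):

```python
import os
import shutil


def promote(temp_path: str, main_path: str, sync_mode: str) -> None:
    if not os.path.exists(temp_path):
        return
    if sync_mode == "full_refresh":
        # Replace the main file with the freshly written temp file
        shutil.move(temp_path, main_path)
    elif sync_mode == "incremental":
        # Append the new records, then remove the temp file
        with open(main_path, "a") as main_file, open(temp_path) as temp_file:
            main_file.write(temp_file.read())
        os.remove(temp_path)


# promote("creatures_incremental.json", "creatures.json", "incremental")
```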
bizon/connectors/destinations/logger/config/logger_incremental.example.yml
ADDED
@@ -0,0 +1,21 @@
+name: dummy to logger (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Logger outputs records with [incremental] prefix for easy identification
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -8,6 +8,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes
 
 from .config import LoggerDestinationConfig
 
@@ -36,6 +37,17 @@ class LoggerDestination(AbstractDestination):
         return True
 
     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        sync_mode_label = f"[{self.sync_metadata.sync_mode}]" if self.sync_metadata.sync_mode else ""
         for record in df_destination_records.iter_rows(named=True):
-            logger.info(record["source_data"])
+            logger.info(f"{sync_mode_label} {record['source_data']}")
         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync - logs completion message based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            logger.info("Logger destination: FULL_REFRESH sync completed")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            logger.info("Logger destination: INCREMENTAL sync completed (records appended)")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            logger.info("Logger destination: STREAM sync batch completed")
+        return True
bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml
ADDED
@@ -0,0 +1,51 @@
+name: gsheets incremental sync
+
+source:
+  name: gsheets
+  stream: worksheet
+  sync_mode: incremental
+  cursor_field: updated_at # Column name in your sheet containing timestamps
+  spreadsheet_url: <MY_SPREADSHEET_URL>
+  worksheet_name: Sheet1
+  service_account_key: >-
+    {
+      "type": "service_account",
+      "project_id": "<MY_GCP_PROJECT>",
+      "private_key_id": "xxx",
+      "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
+      "client_email": "bizon@<MY_GCP_PROJECT>.iam.gserviceaccount.com",
+      "client_id": "999999999999",
+      "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+      "token_uri": "https://oauth2.googleapis.com/token",
+      "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+      "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...",
+      "universe_domain": "googleapis.com"
+    }
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: gsheets_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Google Sheets:
+# - First run: Fetches all rows (full refresh behavior)
+# - Subsequent runs: Only fetches rows where cursor_field > last_run
+#
+# IMPORTANT: Your Google Sheet must have a timestamp column for incremental sync.
+# Common patterns:
+# - Add an "updated_at" column with formula: =NOW() (updates on edit)
+# - Use Google Apps Script to auto-update timestamps on row changes
+# - Manually maintain a "last_modified" column
+#
+# If your sheet doesn't have timestamps, use sync_mode: full_refresh instead.
bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,40 @@
+name: hubspot contacts incremental sync
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for contacts
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    params:
+      token: <MY_API_KEY>
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: hubspot_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for HubSpot:
+# - First run: Fetches all contacts (full refresh behavior)
+# - Subsequent runs: Only fetches contacts where updatedAt > last_run
+#
+# Common cursor fields by stream:
+# - contacts: updatedAt
+# - companies: updatedAt
+# - deals: updatedAt
+# - tickets: updatedAt
+# - products: updatedAt
bizon/connectors/sources/notion/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,48 @@
+name: notion pages incremental sync
+
+source:
+  name: notion
+  stream: pages # Options: databases, data_sources, pages, blocks, users
+  sync_mode: incremental
+  cursor_field: last_edited_time # Notion's timestamp field
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: notion_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Notion:
+# - First run: Fetches all pages/databases (full refresh behavior)
+# - Subsequent runs: Only fetches items where last_edited_time > last_run
+#
+# Supported streams for incremental sync:
+# - pages, all_pages: Uses Search API with last_edited_time filter
+# - databases, all_databases: Uses Search API to find updated data_sources
+# - blocks: First finds updated pages, then fetches their blocks
+# - blocks_markdown, all_blocks_markdown: Same as blocks, converts to markdown
+#
+# Not supported (falls back to full refresh):
+# - users: No timestamp filter available
+# - data_sources: Use databases stream instead
bizon/connectors/sources/notion/src/source.py
CHANGED
@@ -10,7 +10,7 @@ from urllib3.util.retry import Retry
 from bizon.source.auth.builder import AuthBuilder
 from bizon.source.auth.config import AuthType
 from bizon.source.config import SourceConfig
-from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.models import SourceIncrementalState, SourceIteration, SourceRecord
 from bizon.source.source import AbstractSource
 
 from .config import NotionSourceConfig, NotionStreams
@@ -1132,6 +1132,348 @@ class NotionSource(AbstractSource):
 
         return SourceIteration(records=records, next_pagination=next_pagination)
 
+    # ==================== INCREMENTAL SYNC ====================
+
+    def search_with_filter(
+        self, start_cursor: str = None, last_edited_after: str = None, object_type: str = None
+    ) -> dict:
+        """
+        Search with optional last_edited_time filter for incremental sync.
+
+        Note: Notion Search API doesn't support timestamp filtering directly.
+        We sort by last_edited_time descending and filter client-side.
+
+        Args:
+            start_cursor: Pagination cursor
+            last_edited_after: ISO 8601 timestamp to filter by last_edited_time
+            object_type: Optional filter by object type ("page" or "database")
+
+        Returns:
+            Search results filtered by timestamp
+        """
+        payload = {"page_size": self.config.page_size}
+        if start_cursor:
+            payload["start_cursor"] = start_cursor
+
+        # Sort by last_edited_time descending to get most recent first
+        if last_edited_after:
+            payload["sort"] = {"direction": "descending", "timestamp": "last_edited_time"}
+
+        response = self.session.post(f"{BASE_URL}/search", json=payload)
+        response.raise_for_status()
+        result = response.json()
+
+        # Filter by object_type client-side if specified
+        if object_type:
+            result["results"] = [item for item in result.get("results", []) if item.get("object") == object_type]
+
+        # Filter by last_edited_time client-side
+        # Since results are sorted descending, stop when we hit an old item
+        if last_edited_after:
+            filtered_results = []
+            found_old_item = False
+            for item in result.get("results", []):
+                item_edited_time = item.get("last_edited_time", "")
+                if item_edited_time > last_edited_after:
+                    filtered_results.append(item)
+                else:
+                    found_old_item = True
+                    break
+
+            result["results"] = filtered_results
+            # If we found an old item, no need to paginate further
+            if found_old_item:
+                result["has_more"] = False
+
+        return result
+
+    def get_pages_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
+        """
+        Fetch pages updated after source_state.last_run using the Search API with timestamp filter.
+        """
+        cursor = pagination.get("start_cursor") if pagination else None
+        last_edited_after = source_state.last_run.isoformat()
+
+        result = self.search_with_filter(start_cursor=cursor, last_edited_after=last_edited_after, object_type="page")
+
+        records = [SourceRecord(id=page["id"], data=page) for page in result.get("results", [])]
+
+        logger.info(f"Incremental sync: fetched {len(records)} pages updated after {last_edited_after}")
+
+        next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
+
+        return SourceIteration(records=records, next_pagination=next_pagination)
+
+    def get_all_pages_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
+        """
+        Fetch all pages accessible to the integration updated after source_state.last_run.
+        Same as get_pages_after but without database_ids filter.
+        """
+        return self.get_pages_after(source_state, pagination)
+
+    def get_databases_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
+        """
+        Fetch databases updated after source_state.last_run.
+        """
+        cursor = pagination.get("start_cursor") if pagination else None
+        last_edited_after = source_state.last_run.isoformat()
+
+        # Search for data_sources (databases don't appear directly in search in 2025-09-03 API)
+        result = self.search_with_filter(
+            start_cursor=cursor, last_edited_after=last_edited_after, object_type="data_source"
+        )
+
+        # Extract unique database IDs from data_sources
+        seen_db_ids = set()
+        records = []
+        for ds in result.get("results", []):
+            parent = ds.get("parent", {})
+            if parent.get("type") == "database_id":
+                db_id = parent.get("database_id")
+                if db_id and db_id not in seen_db_ids:
+                    seen_db_ids.add(db_id)
+                    try:
+                        db_data = self.get_database(db_id)
+                        records.append(SourceRecord(id=db_data["id"], data=db_data))
+                    except Exception as e:
+                        logger.error(f"Failed to fetch database {db_id}: {e}")
+
+        logger.info(f"Incremental sync: fetched {len(records)} databases updated after {last_edited_after}")
+
+        next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
+
+        return SourceIteration(records=records, next_pagination=next_pagination)
+
+    def get_blocks_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
+        """
+        Fetch blocks from pages updated after source_state.last_run.
+        First finds updated pages, then fetches their blocks.
+        """
+        if pagination:
+            items_to_process = pagination.get("items_to_process", [])
+            items_loaded = pagination.get("items_loaded", False)
+            search_cursor = pagination.get("search_cursor")
+        else:
+            items_to_process = []
+            items_loaded = False
+            search_cursor = None
+
+        last_edited_after = source_state.last_run.isoformat()
+
+        # Collect pages updated after last_run
+        if not items_loaded:
+            while True:
+                result = self.search_with_filter(
+                    start_cursor=search_cursor, last_edited_after=last_edited_after, object_type="page"
+                )
+                for page in result.get("results", []):
+                    items_to_process.append(
+                        {
+                            "block_id": page["id"],
+                            "input_db_id": None,
+                            "input_page_id": None,
+                            "source_page_id": page["id"],
+                        }
+                    )
+
+                if result.get("has_more"):
+                    search_cursor = result.get("next_cursor")
+                else:
+                    break
+
+            items_loaded = True
+            logger.info(f"Incremental sync: found {len(items_to_process)} pages updated after {last_edited_after}")
+
+        if not items_to_process:
+            return SourceIteration(records=[], next_pagination={})
+
+        # Process a batch in parallel
+        batch_size = self.config.max_workers
+        batch = items_to_process[:batch_size]
+        items_to_process = items_to_process[batch_size:]
+
+        records = []
+
+        def fetch_item_blocks(item_info: dict) -> List[dict]:
+            return self.fetch_blocks_recursively(
+                block_id=item_info["block_id"],
+                parent_input_database_id=item_info["input_db_id"],
+                parent_input_page_id=item_info["input_page_id"],
+                source_page_id=item_info["source_page_id"],
+            )
+
+        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+            futures = {executor.submit(fetch_item_blocks, item_info): item_info for item_info in batch}
+            for future in as_completed(futures):
+                item_info = futures[future]
+                try:
+                    blocks = future.result()
+                    for block in blocks:
+                        records.append(SourceRecord(id=block["id"], data=block))
+                except Exception as e:
+                    logger.error(f"Failed to fetch blocks from {item_info['block_id']}: {e}")
+
+        next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
+
+        return SourceIteration(records=records, next_pagination=next_pagination)
+
+    def get_blocks_markdown_after(
+        self, source_state: SourceIncrementalState, pagination: dict = None
+    ) -> SourceIteration:
+        """
+        Fetch blocks from pages updated after source_state.last_run and convert to markdown.
+        Respects database_ids and database_filters configuration.
+        """
+        if pagination:
+            items_to_process = pagination.get("items_to_process", [])
+            items_loaded = pagination.get("items_loaded", False)
+        else:
+            items_to_process = []
+            items_loaded = False
+
+        last_edited_after = source_state.last_run.isoformat()
+
+        # Collect pages updated after last_run from configured databases
+        if not items_loaded:
+            # Query each configured database with timestamp filter
+            for db_id in self.config.database_ids:
+                try:
+                    db_data = self.get_database(db_id)
+                    db_filter = self.get_filter_for_database(db_id)
+
+                    for ds in db_data.get("data_sources", []):
+                        ds_cursor = None
+                        while True:
+                            # Build filter with last_edited_time constraint
+                            incremental_filter = {
+                                "timestamp": "last_edited_time",
+                                "last_edited_time": {"after": last_edited_after},
+                            }
+                            # Combine with existing database filter if present
+                            if db_filter:
+                                combined_filter = {"and": [incremental_filter, db_filter]}
+                            else:
+                                combined_filter = incremental_filter
+
+                            result = self.query_data_source(ds["id"], ds_cursor, filter=combined_filter)
+                            for page in result.get("results", []):
+                                items_to_process.append(
+                                    {
+                                        "block_id": page["id"],
+                                        "input_db_id": db_id,
+                                        "input_page_id": None,
+                                        "source_page_id": page["id"],
+                                    }
+                                )
+
+                            if result.get("has_more"):
+                                ds_cursor = result.get("next_cursor")
+                            else:
+                                break
+                except Exception as e:
+                    logger.error(f"Failed to query database {db_id} for incremental sync: {e}")
+
+            # Also check configured page_ids (filter by last_edited_time)
+            for page_id in self.config.page_ids:
+                try:
+                    page_data = self.get_page(page_id)
+                    if page_data.get("last_edited_time", "") > last_edited_after:
+                        items_to_process.append(
+                            {
+                                "block_id": page_id,
+                                "input_db_id": None,
+                                "input_page_id": page_id,
+                                "source_page_id": page_id,
+                            }
+                        )
+                except Exception as e:
+                    logger.error(f"Failed to fetch page {page_id} for incremental sync: {e}")
+
+            items_loaded = True
+            logger.info(
+                f"Incremental sync: found {len(items_to_process)} pages for blocks_markdown after {last_edited_after}"
+            )
+
+        if not items_to_process:
+            return SourceIteration(records=[], next_pagination={})
+
+        # Process a batch in parallel
+        batch_size = self.config.max_workers
+        batch = items_to_process[:batch_size]
+        items_to_process = items_to_process[batch_size:]
+
+        records = []
+
+        def fetch_and_convert_item(item_info: dict) -> List[dict]:
+            blocks = self.fetch_blocks_recursively(
+                block_id=item_info["block_id"],
+                parent_input_database_id=item_info["input_db_id"],
+                parent_input_page_id=item_info["input_page_id"],
+                source_page_id=item_info["source_page_id"],
+                fetch_child_databases=False,
+            )
+
+            block_records = []
+            for block in blocks or []:
+                if not block:
+                    continue
+                md = self._block_to_markdown(block)
+                block_records.append(
+                    {
+                        "block_id": block.get("id"),
+                        "block_type": block.get("type"),
+                        "markdown": md,
+                        "source_page_id": block.get("source_page_id"),
+                        "parent_block_id": block.get("parent_block_id"),
+                        "parent_input_database_id": block.get("parent_input_database_id"),
+                        "parent_input_page_id": block.get("parent_input_page_id"),
+                        "depth": block.get("depth"),
+                        "block_order": block.get("block_order"),
+                        "page_order": block.get("page_order"),
+                        "block_raw": block,
+                    }
+                )
+            return block_records
+
+        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+            futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
+            for future in as_completed(futures):
+                item_info = futures[future]
+                try:
+                    block_records = future.result()
+                    for block_record in block_records:
+                        records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
+                except Exception as e:
+                    logger.error(f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}")
+
+        next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
+
+        return SourceIteration(records=records, next_pagination=next_pagination)
+
+    def get_records_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
+        """
+        Fetch records updated after source_state.last_run for incremental sync.
+
+        Supported streams:
+        - pages, all_pages: Uses Search API with last_edited_time filter
+        - databases, all_databases: Uses Search API to find updated data_sources
+        - blocks, all_blocks_markdown: First finds updated pages, then fetches their blocks
+        """
+        stream = self.config.stream
+
+        if stream in [NotionStreams.PAGES, NotionStreams.ALL_PAGES]:
+            return self.get_pages_after(source_state, pagination)
+        elif stream in [NotionStreams.DATABASES, NotionStreams.ALL_DATABASES]:
+            return self.get_databases_after(source_state, pagination)
+        elif stream == NotionStreams.BLOCKS:
+            return self.get_blocks_after(source_state, pagination)
+        elif stream in [NotionStreams.BLOCKS_MARKDOWN, NotionStreams.ALL_BLOCKS_MARKDOWN]:
+            return self.get_blocks_markdown_after(source_state, pagination)
+        else:
+            # For streams that don't support incremental, fall back to full refresh
+            logger.warning(f"Stream {stream} does not support incremental sync, falling back to full refresh")
+            return self.get(pagination)
+
     # ==================== MAIN DISPATCH ====================
 
     def get(self, pagination: dict = None) -> SourceIteration:
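Because the Notion Search API cannot filter on `last_edited_time`, the new `search_with_filter` sorts results newest-first and cuts the page off client-side at the first stale item; Notion's ISO 8601 timestamps compare correctly as plain strings, which is what makes the early stop safe. A minimal sketch of that cutoff logic in isolation:

```python
from typing import Dict, List, Tuple


def cut_at_cutoff(results: List[Dict], last_edited_after: str) -> Tuple[List[Dict], bool]:
    """Keep items newer than the cutoff and report whether an older item was reached.

    Assumes results are sorted by last_edited_time descending and timestamps are ISO 8601,
    so lexicographic comparison matches chronological order.
    """
    fresh, hit_old = [], False
    for item in results:
        if item.get("last_edited_time", "") > last_edited_after:
            fresh.append(item)
        else:
            hit_old = True  # everything after this point is older, stop paginating
            break
    return fresh, hit_old


# cut_at_cutoff(
#     [{"last_edited_time": "2024-06-02T10:00:00.000Z"}, {"last_edited_time": "2024-05-30T08:00:00.000Z"}],
#     "2024-06-01T00:00:00.000Z",
# )  # -> ([{"last_edited_time": "2024-06-02T10:00:00.000Z"}], True)
```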
bizon/engine/pipeline/producer.py
CHANGED
@@ -14,7 +14,9 @@ from bizon.common.models import BizonConfig
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.engine.backend.models import CursorStatus
 from bizon.engine.queue.queue import AbstractQueue
+from bizon.source.config import SourceSyncModes
 from bizon.source.cursor import Cursor
+from bizon.source.models import SourceIncrementalState
 from bizon.source.source import AbstractSource
 
 from .models import PipelineReturnStatus
@@ -130,6 +132,37 @@ class Producer:
             self.queue.terminate(iteration=0)
             return PipelineReturnStatus.BACKEND_ERROR
 
+        # Handle incremental sync mode
+        source_incremental_state = None
+        is_incremental = self.bizon_config.source.sync_mode == SourceSyncModes.INCREMENTAL
+
+        if is_incremental:
+            # Get the last successful job to determine last_run timestamp
+            last_successful_job = self.backend.get_last_successful_stream_job(
+                name=self.bizon_config.name,
+                source_name=self.bizon_config.source.name,
+                stream_name=self.bizon_config.source.stream,
+            )
+
+            if last_successful_job:
+                # Create incremental state with last_run from previous job
+                source_incremental_state = SourceIncrementalState(
+                    last_run=last_successful_job.created_at,
+                    state={},
+                    cursor_field=self.bizon_config.source.cursor_field,
+                )
+                logger.info(
+                    f"Incremental sync: fetching records after {source_incremental_state.last_run} "
+                    f"using cursor_field: {source_incremental_state.cursor_field}"
+                )
+            else:
+                # First incremental run - fall back to full refresh behavior
+                logger.info(
+                    "Incremental sync: No previous successful job found. "
+                    "Falling back to full refresh behavior for first run."
+                )
+                is_incremental = False
+
         while not cursor.is_finished:
             if stop_event.is_set():
                 logger.info("Stop event is set, terminating producer ...")
@@ -180,7 +213,15 @@ class Producer:
 
             # Get the next data
             try:
-                source_iteration = self.source.get(pagination=cursor.pagination)
+                if is_incremental and source_incremental_state:
+                    # Use incremental fetching with get_records_after
+                    source_iteration = self.source.get_records_after(
+                        source_state=source_incremental_state,
+                        pagination=cursor.pagination,
+                    )
+                else:
+                    # Use standard fetching with get
+                    source_iteration = self.source.get(pagination=cursor.pagination)
             except Exception as e:
                 logger.error(traceback.format_exc())
                 logger.error(
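For the producer's dispatch above to take the incremental path, a source only needs to expose `get_records_after(source_state, pagination)` returning a `SourceIteration`. A hedged sketch of what a minimal implementation might look like for a hypothetical REST source (the endpoint, query parameter, and the `self.session` / `self.base_url` attributes are illustrative, not part of bizon):

```python
from bizon.source.models import SourceIncrementalState, SourceIteration, SourceRecord


class MyRestSource:  # stand-in for a concrete AbstractSource subclass
    def get_records_after(self, source_state: SourceIncrementalState, pagination: dict = None) -> SourceIteration:
        # Ask the API only for rows changed since the previous successful run
        params = {
            source_state.cursor_field or "updated_at": source_state.last_run.isoformat(),
            "cursor": (pagination or {}).get("cursor"),
        }
        payload = self.session.get(f"{self.base_url}/items", params=params).json()  # hypothetical endpoint
        records = [SourceRecord(id=item["id"], data=item) for item in payload["items"]]
        next_pagination = {"cursor": payload["next_cursor"]} if payload.get("has_more") else {}
        return SourceIteration(records=records, next_pagination=next_pagination)
```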
bizon/source/config.py
CHANGED
@@ -42,6 +42,12 @@ class SourceConfig(BaseModel, ABC):
         default=SourceSyncModes.FULL_REFRESH,
     )
 
+    cursor_field: Optional[str] = Field(
+        default=None,
+        description="Field name to use for incremental filtering (e.g., 'updated_at', 'modified_at'). "
+        "Source will fetch records where this field > last_run timestamp.",
+    )
+
     force_ignore_checkpoint: bool = Field(
         description="Whether to force recreate the sync from iteration 0. Existing checkpoints will be ignored.",
         default=False,
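`cursor_field` is declared on the shared `SourceConfig`, so any source's YAML can carry it without connector changes; it is optional and only meaningful when `sync_mode: incremental` is set. A tiny pydantic illustration of the optional-field behaviour (a stand-in model, not bizon's class):

```python
from typing import Optional

from pydantic import BaseModel, Field


class DemoSourceConfig(BaseModel):  # stand-in for bizon's SourceConfig
    sync_mode: str = "full_refresh"
    cursor_field: Optional[str] = Field(default=None, description="Timestamp field for incremental filtering")


print(DemoSourceConfig().cursor_field)  # None
print(DemoSourceConfig(sync_mode="incremental", cursor_field="updated_at").cursor_field)  # updated_at
```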
bizon/source/models.py
CHANGED
@@ -44,4 +44,5 @@ class SourceIteration(BaseModel):
 
 class SourceIncrementalState(BaseModel):
     last_run: datetime = Field(..., description="Timestamp of the last successful run")
-    state: dict = Field(
+    state: dict = Field(default_factory=dict, description="Incremental state information from the latest sync")
+    cursor_field: Optional[str] = Field(default=None, description="The field name to filter records by timestamp")
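With `state` now using a `default_factory`, `SourceIncrementalState` can be built from just a timestamp and an optional cursor field, which is how the producer constructs it. A short usage sketch (the datetime is an arbitrary example value):

```python
from datetime import datetime

from bizon.source.models import SourceIncrementalState

state = SourceIncrementalState(last_run=datetime(2024, 6, 1, 12, 0), cursor_field="updatedAt")
print(state.state)         # {} - empty dict supplied by default_factory
print(state.cursor_field)  # updatedAt
```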
{bizon-0.2.0.dist-info → bizon-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bizon
-Version: 0.2.0
+Version: 0.3.0
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author-email: Antoine Balliet <antoine.balliet@gmail.com>, Anas El Mhamdi <anas.elmhamdi@gmail.com>
 License-File: LICENSE
@@ -153,6 +153,130 @@ Runner is the interface used by Bizon to run the pipeline. It can be configured
 - `process` (asynchronous)
 - `stream` (synchronous)
 
+## Sync Modes
+
+Bizon supports three sync modes:
+- `full_refresh`: Re-syncs all data from scratch on each run
+- `incremental`: Syncs only new/updated data since the last successful run
+- `stream`: Continuous streaming mode for real-time data (e.g., Kafka)
+
+### Incremental Sync
+
+Incremental sync fetches only new or updated records since the last successful run, using an **append-only** strategy.
+
+#### Configuration
+
+```yaml
+source:
+  name: your_source
+  stream: your_stream
+  sync_mode: incremental
+  cursor_field: updated_at # The timestamp field to filter records by
+```
+
+#### How It Works
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                        INCREMENTAL SYNC FLOW                         │
+├─────────────────────────────────────────────────────────────────────┤
+│                                                                       │
+│  1. Producer checks for last successful job                           │
+│     └─> Backend.get_last_successful_stream_job()                      │
+│                                                                       │
+│  2. If found, creates SourceIncrementalState:                         │
+│     └─> last_run = previous_job.created_at                            │
+│     └─> cursor_field = config.cursor_field (e.g., "updated_at")       │
+│                                                                       │
+│  3. Calls source.get_records_after(source_state, pagination)          │
+│     └─> Source filters: WHERE cursor_field > last_run                 │
+│                                                                       │
+│  4. Records written to temp table: {table}_incremental                │
+│                                                                       │
+│  5. finalize() appends temp table to main table                       │
+│     └─> INSERT INTO main_table SELECT * FROM temp_table               │
+│     └─> Deletes temp table                                            │
+│                                                                       │
+│  FIRST RUN: No previous job → falls back to get() (full refresh)      │
+│                                                                       │
+└─────────────────────────────────────────────────────────────────────┘
+```
+
+#### Configuration Options
+
+| Option | Required | Description | Example |
+|--------|----------|-------------|---------|
+| `sync_mode` | Yes | Set to `incremental` | `incremental` |
+| `cursor_field` | Yes | Timestamp field to filter by | `updated_at`, `last_edited_time`, `modified_at` |
+
+#### Supported Sources
+
+Sources must implement `get_records_after()` to support incremental sync:
+
+| Source | Cursor Field | Notes |
+|--------|--------------|-------|
+| `notion` | `last_edited_time` | Supports `pages`, `databases`, `blocks`, `blocks_markdown` streams |
+| (others) | Varies | Check source docs or implement `get_records_after()` |
+
+#### Supported Destinations
+
+Destinations must implement `finalize()` with incremental logic:
+
+| Destination | Support | Notes |
+|-------------|---------|-------|
+| `bigquery` | ✅ | Append-only via temp table |
+| `bigquery_streaming_v2` | ✅ | Append-only via temp table |
+| `file` | ✅ | Appends to existing file |
+| `logger` | ✅ | Logs completion |
+
+#### Example: Notion Incremental Sync
+
+```yaml
+name: notion_incremental_sync
+
+source:
+  name: notion
+  stream: blocks_markdown
+  sync_mode: incremental
+  cursor_field: last_edited_time
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+  # Optional: filter which pages to sync
+  database_filters:
+    "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx":
+      property: "Status"
+      select:
+        equals: "Published"
+
+destination:
+  name: bigquery
+  config:
+    project_id: my-gcp-project
+    dataset_id: notion_data
+    dataset_location: US
+
+engine:
+  backend:
+    type: bigquery
+    database: my-gcp-project
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+```
+
+#### First Run Behavior
+
+On the first incremental run (no previous successful job):
+- Falls back to `get()` method (full refresh behavior)
+- All data is fetched and loaded
+- Job is marked as successful
+- Subsequent runs use `get_records_after()` with `last_run` timestamp
+
 ## Start syncing your data 🚀
 
 ### Quick setup without any dependencies ✌️
@@ -13,21 +13,24 @@ bizon/common/models.py,sha256=eL_Ii0CkeJFIjak1CKrB74mbC3OkmWP2uI27ynlYgkQ,10070
 bizon/common/errors/backoff.py,sha256=z7RkQt1Npdh0sfD3hBDaiWQKe4iqS6ewvT1Q4Fds5aU,508
 bizon/common/errors/errors.py,sha256=mrYx1uE2kOuR2pEaB7ztK1l2m0E4V-_-hxq-DuILerY,682
 bizon/connectors/destinations/bigquery/config/bigquery.example.yml,sha256=sy5-Piew00BlcjX5CFayFVrUq9G_vFYWXDmpWi9beTY,1263
+bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml,sha256=z0pz4W1x0dlsoAjorYR2DxMjkzTvIWn9tigqtOR8PUY,1076
 bizon/connectors/destinations/bigquery/src/config.py,sha256=q55zR_9V5-ZZmOmSK7fDOHSzzYhoT-fwlppDzX4he9U,4000
-bizon/connectors/destinations/bigquery/src/destination.py,sha256=
+bizon/connectors/destinations/bigquery/src/destination.py,sha256=awS3dZsSKqLTVnhBKuP_9rXSt3IpGv3c4WjZOCwqu9o,9888
 bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml,sha256=rF0mQ5IaOe6oqsbVy6q0innn7SXsOoBdBvIN8BTwPVc,1869
 bizon/connectors/destinations/bigquery_streaming/src/config.py,sha256=LdBKEqHPaGll8PW6c6q_lH7PJvsGdtv2BCrtB-TukTA,1898
-bizon/connectors/destinations/bigquery_streaming/src/destination.py,sha256=
+bizon/connectors/destinations/bigquery_streaming/src/destination.py,sha256=Uyne57NoT-z9uk7Yi4EgOUFYQ4QlvXDLFxgZC5KyCFE,14222
 bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml,sha256=hIQXlXtiBT8DgMVAs0x_h-19xoLkjHr-Ko7oSn8jnc0,2023
 bizon/connectors/destinations/bigquery_streaming_v2/src/config.py,sha256=cdHST5Vx1VQbLsIVsPkoEtOJKmbA35XjsKzj6fZ5DHw,1907
-bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py,sha256=
+bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py,sha256=5aXEsbzyWKzS2F1pFMZ8pdbJaXmdGTaIrwgl2cd1IbU,19026
 bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py,sha256=aWYVzMPMTgsdDapYniu8h6Tf2Pty4fDisT_33d9yEJ4,3692
 bizon/connectors/destinations/file/config/file.example.yml,sha256=sMeX92hTrTQUrLmQgQFsq5OdG5Dk3BbpDo0NhRbBahI,986
+bizon/connectors/destinations/file/config/file_incremental.example.yml,sha256=Xh5KwWiQRuq_MnMgOCHiHqIwHjOjXbwQlVlVcKdXARA,620
 bizon/connectors/destinations/file/src/config.py,sha256=dU64aFe7J63aBGh6Os8mXl2kvECj3s4pPC7H3EmOvb8,585
-bizon/connectors/destinations/file/src/destination.py,sha256=
+bizon/connectors/destinations/file/src/destination.py,sha256=RQEL0Z5l409S319fAJyvW8cDblUCVAxPhALJVhjQKDM,4253
 bizon/connectors/destinations/logger/config/logger.example.yml,sha256=KtQRmqqFeziJtBZ7vzrXGQLdTgWZNjxx2sdFXpIgIp4,672
+bizon/connectors/destinations/logger/config/logger_incremental.example.yml,sha256=rwTLlXib-Jo3b4-_NcFv2ShdPC73WEpiiX3apP3sKg0,541
 bizon/connectors/destinations/logger/src/config.py,sha256=vIV_G0k9c8DPcDxU6CGvEOL2zAEvAmKZcx3RV0eRi7A,426
-bizon/connectors/destinations/logger/src/destination.py,sha256
+bizon/connectors/destinations/logger/src/destination.py,sha256=YUC_lAN5nrcrNAN90hnalKFAKX49KTDlJwdLfwTaC0U,2007
 bizon/connectors/sources/cycle/config/cycle.example.yml,sha256=UDiqOa-8ZsykmNT625kxq9tyXOj_gKe9CFwg9r_8SYk,230
 bizon/connectors/sources/cycle/src/source.py,sha256=6sXMneq59XZAT5oJseM9k6sGJaoQw4NDp8FTtg8lPhk,4213
 bizon/connectors/sources/cycle/tests/cycle_customers.py,sha256=A48S20LxIC0A74JLoFn4NTHNTgBWV_5stTFtF1Gfk2c,271
@@ -43,9 +46,11 @@ bizon/connectors/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py,sha25
 bizon/connectors/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py,sha256=PFUhDuFw1Q1AMNMsnXPQxoqHIWf_wHEL1hLQodYlLcQ,596
 bizon/connectors/sources/gsheets/config/default_auth.example.yml,sha256=KOBp6MfO4uJwpwEYW0tJ4X5ctVwwdur9poJB4Ohba6s,348
 bizon/connectors/sources/gsheets/config/service_account.example.yml,sha256=XxVUnk9gGWc3lDb8CnzTHjTu8xz4Asyr5tXzY6qLvPg,1081
+bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml,sha256=WGvAtw4aOwSMWrSZW0tHaRncZnGbI6gd4LJk1aHIP_c,1765
 bizon/connectors/sources/gsheets/src/source.py,sha256=xNF5FR9QLTM4kCiZ2eKZ5CZWNhLw6tyLaJZbliNzYnY,5675
 bizon/connectors/sources/gsheets/tests/gsheets_pipeline.py,sha256=lNSM3kZTd4W_-ajGIO3mdp8qGdEbnmWqsMm5pRiS0cw,181
 bizon/connectors/sources/hubspot/config/api_key.example.yml,sha256=VDTRloE5caqAdGdXgvsJZ6nQT46JHzX_YboxeGbpP18,389
+bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml,sha256=g4SBeVEXSr3tCgy5VjgZPWkhnuvEZ0jl5nPNn3u05Jc,920
 bizon/connectors/sources/hubspot/config/oauth.example.yml,sha256=YqBtj1IxIsdM9E85_4eVWl6mPiHsQNoQn41EzCqORy0,499
 bizon/connectors/sources/hubspot/src/hubspot_base.py,sha256=THo8ImrPrIxeTuFcBMRJYwaDMstIfLIGjrQLE2cqqsU,3424
 bizon/connectors/sources/hubspot/src/hubspot_objects.py,sha256=ykqvxaFihv0e0A3-gGDmentp1KCGCoYvvDwZ3CcHzNg,6301
@@ -60,9 +65,10 @@ bizon/connectors/sources/kafka/src/decode.py,sha256=RhPjazRQHb72D9iBhb763Nje7SH9
 bizon/connectors/sources/kafka/src/source.py,sha256=0Hv6viyVZGAd4azhQnqCteyHuwsbbDL4rSGEjMCff9E,19722
 bizon/connectors/sources/kafka/tests/kafka_pipeline.py,sha256=9LaCqXJIEx2ye3dkWq0YK_bPX7d4fCX_OcDOJCk34WE,206
 bizon/connectors/sources/notion/config/api_key.example.yml,sha256=TagqOqaho4u_G5ZP4L8je89Y4G_NvCo8s4Wf9e8yVH8,1061
+bizon/connectors/sources/notion/config/api_key_incremental.example.yml,sha256=52uQJo-SrqFny00zIVbA86qVq3asYHMFALqBcdmPmc8,1499
 bizon/connectors/sources/notion/src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bizon/connectors/sources/notion/src/config.py,sha256=L-FZWijUa-aWK9VenWGsl6mv40i4ww46FacjYoX9gXo,1886
-bizon/connectors/sources/notion/src/source.py,sha256=
+bizon/connectors/sources/notion/src/source.py,sha256=aViwfLuBzsNGZHwU4-z-xI40cROJTvx7Tlkw3ApF3q8,66217
 bizon/connectors/sources/notion/tests/notion_pipeline.py,sha256=lyiD9b5uUF3oih8vY4gk7QXnfySGSawnbrBuSdTLym8,200
 bizon/connectors/sources/notion/tests/test_notion.py,sha256=-G0DbTLDS2Gc_Bx8xR2VXnY89vW64s1-puwPc9x2N7A,4029
 bizon/connectors/sources/periscope/config/periscope_charts.example.yml,sha256=9OgFDB7vguiNz2F2fmRqDNV8S_ddO9ncN5hgW9MhME4,350
@@ -88,7 +94,7 @@ bizon/engine/backend/adapters/sqlalchemy/backend.py,sha256=ipJ7eY_iiqjrvtq4NS39C
 bizon/engine/backend/adapters/sqlalchemy/config.py,sha256=CeTWncVK27Y6lEKMVCF5RxD8Illhx2IQqqFkGrf0WKA,1845
 bizon/engine/pipeline/consumer.py,sha256=DtCR3mG791h35poYJdXjL9geNO-GWPKl_YC0zPsF5qI,3207
 bizon/engine/pipeline/models.py,sha256=qOra2MJGN6-PuouKpKuZRjutnQmzom0mgWDFZ16LcM8,405
-bizon/engine/pipeline/producer.py,sha256=
+bizon/engine/pipeline/producer.py,sha256=XV2fR6CNMRlbYwqTl9mlqy6nkG37ODyh2aiiTZ371VM,11995
 bizon/engine/queue/config.py,sha256=0XwiQSB2OKTs-rODCSZqT5txNZzGOic2-PvODbcSrGg,1267
 bizon/engine/queue/queue.py,sha256=Y9uj31d-ZgW2f0F02iccp_o-m-RoMm_jR61NkLdMQ2M,3461
 bizon/engine/queue/adapters/kafka/config.py,sha256=ndNEXRT-nIgyWgoqlNXFhmlN206v87GobXIW9Z0zrSA,1085
@@ -113,10 +119,10 @@ bizon/monitoring/datadog/monitor.py,sha256=YSdyMVEIjkDyp91_mGED_kx8j76MbQyQGkGJC
 bizon/monitoring/noop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 bizon/monitoring/noop/monitor.py,sha256=Pu7Qt9SpUG1UvC8aWysgtoDY-t5tnKd4FlUXAC4MjbI,1066
 bizon/source/callback.py,sha256=lfTwU_bzJwR0q5sbiKoK8uedQ-dhfHzoYkPVqm8b_Ho,602
-bizon/source/config.py,sha256=
+bizon/source/config.py,sha256=JyZbKjlU0xhiyuuIGJYJPGUl9JxS4xyGeCyHoHgHHos,2473
 bizon/source/cursor.py,sha256=Wjh9eNEiHV5P9YnjS5bdS2ahyFc0gPm9QLQtD-QjQCI,4089
 bizon/source/discover.py,sha256=h9IVqtAQsTH-XxR-UkAFgNvEphLP2LgataQCCuHbGrk,11174
-bizon/source/models.py,sha256=
+bizon/source/models.py,sha256=CHPKvO9chRi85WPDfLYy9vWnPsua8LTwYvjjN7Dj2uA,1837
 bizon/source/session.py,sha256=klbCv0g6sm6ac-pzM50eAJSP8DdQ9DOegHgjpmKKUrI,1978
 bizon/source/source.py,sha256=k_fHOOvam5ixZ9oPuQzUa9Kq3jVvv2HY7ghrCo-0o3I,4342
 bizon/source/auth/builder.py,sha256=hc4zBNj31LZc-QqgIyx1VQEYTm9Xv81vY5pJiwQroJo,860
@@ -129,8 +135,8 @@ bizon/source/auth/authenticators/oauth.py,sha256=tY_UZsWTy4FkifqJ7-smPaD61gg1dMJ
 bizon/source/auth/authenticators/token.py,sha256=P6SKRAarAEv28YiWp8hQLSKAV7twNlyNTGRr9sxlx58,956
 bizon/transform/config.py,sha256=Q9F7jlsuaXK8OYrO5qcdk8lxXTDoIgzoVMhhHW3igEw,213
 bizon/transform/transform.py,sha256=Ufla8YFx9C9WEiN0ppmZS1a86Sk0PgggqC-8DIvDeAQ,1414
-bizon-0.
-bizon-0.
-bizon-0.
-bizon-0.
-bizon-0.
+bizon-0.3.0.dist-info/METADATA,sha256=oX7OZjHhKAVvQ8UiRS0ksqu3C65t2kOp2mAfXoEBdJY,11159
+bizon-0.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+bizon-0.3.0.dist-info/entry_points.txt,sha256=hHZPN-V6JwwhSYWNCKVu3WNxekuhXtIAaz_zdwO7NDo,45
+bizon-0.3.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+bizon-0.3.0.dist-info/RECORD,,
File without changes
File without changes
File without changes