bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +184 -4
- bizon/connectors/destinations/bigquery/src/config.py +1 -1
- bizon/connectors/destinations/bigquery/src/destination.py +14 -9
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +2 -1
- bizon/connectors/destinations/file/src/destination.py +3 -6
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +1 -2
- bizon/connectors/destinations/logger/src/destination.py +4 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -12
- bizon/connectors/sources/kafka/src/decode.py +65 -60
- bizon/connectors/sources/kafka/src/source.py +182 -61
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +9 -1
- bizon/destination/destination.py +38 -9
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +114 -42
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +14 -9
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +100 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +18 -3
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.1.dist-info/RECORD +0 -123
- bizon-0.1.1.dist-info/entry_points.txt +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/notion/src/source.py
@@ -0,0 +1,1159 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, List, Optional, Tuple

from loguru import logger
from requests import Session
from requests.adapters import HTTPAdapter
from requests.auth import AuthBase
from urllib3.util.retry import Retry

from bizon.source.auth.builder import AuthBuilder
from bizon.source.auth.config import AuthType
from bizon.source.config import SourceConfig
from bizon.source.models import SourceIteration, SourceRecord
from bizon.source.source import AbstractSource

from .config import NotionSourceConfig, NotionStreams

NOTION_API_VERSION = "2025-09-03"
BASE_URL = "https://api.notion.com/v1"


class NotionSource(AbstractSource):
    def __init__(self, config: NotionSourceConfig):
        super().__init__(config)
        self.config: NotionSourceConfig = config

    def get_session(self) -> Session:
        """Create a session with retry logic and required Notion headers."""
        session = Session()
        retries = Retry(
            total=10,
            backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32... seconds
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST", "PATCH"],  # Retry on POST/PATCH too
            respect_retry_after_header=True,  # Honor Notion's Retry-After header
        )
        session.mount("https://", HTTPAdapter(max_retries=retries))
        session.headers.update(
            {
                "Notion-Version": NOTION_API_VERSION,
                "Content-Type": "application/json",
            }
        )
        return session

    def get_authenticator(self) -> AuthBase:
        if self.config.authentication.type.value in [AuthType.API_KEY, AuthType.BEARER]:
            return AuthBuilder.token(params=self.config.authentication.params)
        return None

    @staticmethod
    def streams() -> List[str]:
        return [item.value for item in NotionStreams]

    @staticmethod
    def get_config_class() -> SourceConfig:
        return NotionSourceConfig

    def check_connection(self) -> Tuple[bool, Optional[Any]]:
        """Test connection by listing users."""
        try:
            response = self.session.get(f"{BASE_URL}/users")
            response.raise_for_status()
            return True, None
        except Exception as e:
            return False, str(e)

    def get_total_records_count(self) -> Optional[int]:
        return None

    # ==================== USERS STREAM ====================

    def get_users(self, pagination: dict = None) -> SourceIteration:
        """Fetch all users accessible to the integration."""
        params = {"page_size": self.config.page_size}
        if pagination and pagination.get("start_cursor"):
            params["start_cursor"] = pagination["start_cursor"]

        response = self.session.get(f"{BASE_URL}/users", params=params)
        response.raise_for_status()
        data = response.json()

        records = [SourceRecord(id=user["id"], data=user) for user in data.get("results", [])]

        next_pagination = {"start_cursor": data.get("next_cursor")} if data.get("has_more") else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== DATABASES STREAM ====================

    def get_database(self, database_id: str) -> dict:
        """Fetch a single database by ID."""
        response = self.session.get(f"{BASE_URL}/databases/{database_id}")
        response.raise_for_status()
        return response.json()

    def get_databases(self, pagination: dict = None) -> SourceIteration:
        """Fetch databases for the configured database_ids."""
        if not self.config.database_ids:
            logger.warning("No database_ids configured, returning empty results")
            return SourceIteration(records=[], next_pagination={})

        # Track progress through database_ids list
        if pagination:
            remaining_ids = pagination.get("remaining_ids", [])
        else:
            remaining_ids = list(self.config.database_ids)

        if not remaining_ids:
            return SourceIteration(records=[], next_pagination={})

        # Process one database at a time
        database_id = remaining_ids[0]
        remaining_ids = remaining_ids[1:]

        try:
            database_data = self.get_database(database_id)
            records = [SourceRecord(id=database_data["id"], data=database_data)]
        except Exception as e:
            logger.error(f"Failed to fetch database {database_id}: {e}")
            records = []

        next_pagination = {"remaining_ids": remaining_ids} if remaining_ids else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== DATA SOURCES STREAM ====================

    def get_data_sources(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch data sources from databases.
        In the 2025-09-03 API, databases contain a data_sources array.
        """
        if not self.config.database_ids:
            logger.warning("No database_ids configured, returning empty results")
            return SourceIteration(records=[], next_pagination={})

        if pagination:
            remaining_ids = pagination.get("remaining_ids", [])
        else:
            remaining_ids = list(self.config.database_ids)

        if not remaining_ids:
            return SourceIteration(records=[], next_pagination={})

        database_id = remaining_ids[0]
        remaining_ids = remaining_ids[1:]

        records = []
        try:
            database_data = self.get_database(database_id)
            data_sources = database_data.get("data_sources", [])

            for ds in data_sources:
                # Enrich data source with parent database info
                ds_record = {
                    **ds,
                    "parent_database_id": database_id,
                    "parent_database_title": self._extract_title(database_data),
                }
                records.append(SourceRecord(id=ds["id"], data=ds_record))

        except Exception as e:
            logger.error(f"Failed to fetch data sources from database {database_id}: {e}")

        next_pagination = {"remaining_ids": remaining_ids} if remaining_ids else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== PAGES STREAM ====================

    def query_data_source(self, data_source_id: str, start_cursor: str = None, filter: dict = None) -> dict:
        """Query a data source for its pages."""
        payload = {"page_size": self.config.page_size}
        if start_cursor:
            payload["start_cursor"] = start_cursor
        if filter:
            payload["filter"] = filter

        response = self.session.post(f"{BASE_URL}/data_sources/{data_source_id}/query", json=payload)
        response.raise_for_status()
        return response.json()

    def get_page(self, page_id: str) -> dict:
        """Fetch a single page by ID."""
        response = self.session.get(f"{BASE_URL}/pages/{page_id}")
        response.raise_for_status()
        return response.json()

    def get_pages(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch pages from data sources (querying databases) and/or specific page IDs.
        """
        records = []

        if pagination:
            # Continue previous pagination state
            # remaining_data_sources is list of {"ds_id": str, "db_id": str}
            remaining_data_sources = pagination.get("remaining_data_sources", [])
            current_data_source = pagination.get("current_data_source")  # {"ds_id": str, "db_id": str}
            data_source_cursor = pagination.get("data_source_cursor")
            remaining_page_ids = pagination.get("remaining_page_ids", [])
            data_sources_loaded = pagination.get("data_sources_loaded", False)
        else:
            remaining_data_sources = []
            current_data_source = None
            data_source_cursor = None
            remaining_page_ids = list(self.config.page_ids)
            data_sources_loaded = False

        # First, load all data sources from databases if not done
        if not data_sources_loaded and self.config.database_ids:
            for db_id in self.config.database_ids:
                try:
                    db_data = self.get_database(db_id)
                    for ds in db_data.get("data_sources", []):
                        remaining_data_sources.append({"ds_id": ds["id"], "db_id": db_id})
                except Exception as e:
                    logger.error(f"Failed to get data sources from database {db_id}: {e}")
            data_sources_loaded = True

        # Process current data source if we have one with a cursor
        if current_data_source and data_source_cursor:
            try:
                ds_filter = self.get_filter_for_database(current_data_source["db_id"])
                result = self.query_data_source(current_data_source["ds_id"], data_source_cursor, filter=ds_filter)
                for page in result.get("results", []):
                    records.append(SourceRecord(id=page["id"], data=page))

                if result.get("has_more"):
                    return SourceIteration(
                        records=records,
                        next_pagination={
                            "remaining_data_sources": remaining_data_sources,
                            "current_data_source": current_data_source,
                            "data_source_cursor": result.get("next_cursor"),
                            "remaining_page_ids": remaining_page_ids,
                            "data_sources_loaded": True,
                        },
                    )
            except Exception as e:
                logger.error(f"Failed to query data source {current_data_source['ds_id']}: {e}")

        # Process next data source
        if remaining_data_sources:
            ds_info = remaining_data_sources[0]
            remaining_data_sources = remaining_data_sources[1:]

            try:
                ds_filter = self.get_filter_for_database(ds_info["db_id"])
                result = self.query_data_source(ds_info["ds_id"], filter=ds_filter)
                for page in result.get("results", []):
                    records.append(SourceRecord(id=page["id"], data=page))

                if result.get("has_more"):
                    return SourceIteration(
                        records=records,
                        next_pagination={
                            "remaining_data_sources": remaining_data_sources,
                            "current_data_source": ds_info,
                            "data_source_cursor": result.get("next_cursor"),
                            "remaining_page_ids": remaining_page_ids,
                            "data_sources_loaded": True,
                        },
                    )

                # If there are more data sources, continue
                if remaining_data_sources:
                    return SourceIteration(
                        records=records,
                        next_pagination={
                            "remaining_data_sources": remaining_data_sources,
                            "current_data_source": None,
                            "data_source_cursor": None,
                            "remaining_page_ids": remaining_page_ids,
                            "data_sources_loaded": True,
                        },
                    )
            except Exception as e:
                logger.error(f"Failed to query data source {ds_info['ds_id']}: {e}")
                # Continue with remaining data sources
                if remaining_data_sources:
                    return SourceIteration(
                        records=records,
                        next_pagination={
                            "remaining_data_sources": remaining_data_sources,
                            "current_data_source": None,
                            "data_source_cursor": None,
                            "remaining_page_ids": remaining_page_ids,
                            "data_sources_loaded": True,
                        },
                    )

        # Process individual page IDs
        if remaining_page_ids:
            page_id = remaining_page_ids[0]
            remaining_page_ids = remaining_page_ids[1:]

            try:
                page_data = self.get_page(page_id)
                records.append(SourceRecord(id=page_data["id"], data=page_data))
            except Exception as e:
                logger.error(f"Failed to fetch page {page_id}: {e}")

            if remaining_page_ids:
                return SourceIteration(
                    records=records,
                    next_pagination={
                        "remaining_data_sources": [],
                        "current_data_source": None,
                        "data_source_cursor": None,
                        "remaining_page_ids": remaining_page_ids,
                        "data_sources_loaded": True,
                    },
                )

        return SourceIteration(records=records, next_pagination={})

    # ==================== BLOCKS STREAM ====================

    def get_block_children(self, block_id: str, start_cursor: str = None) -> dict:
        """Fetch children blocks of a block."""
        params = {"page_size": self.config.page_size}
        if start_cursor:
            params["start_cursor"] = start_cursor

        response = self.session.get(f"{BASE_URL}/blocks/{block_id}/children", params=params)
        response.raise_for_status()
        return response.json()

    def get_pages_from_database(self, database_id: str, apply_filter: bool = False) -> List[str]:
        """Get all page IDs from a database by querying its data sources.

        Args:
            database_id: The database ID to fetch pages from
            apply_filter: Whether to apply database_filters config (False for inline databases)
        """
        page_ids = []
        db_filter = self.get_filter_for_database(database_id) if apply_filter else None
        try:
            db_data = self.get_database(database_id)
            if not db_data:
                return page_ids
            for ds in db_data.get("data_sources") or []:
                cursor = None
                while True:
                    result = self.query_data_source(ds["id"], cursor, filter=db_filter)
                    if not result:
                        break
                    for page in result.get("results") or []:
                        if page and page.get("id"):
                            page_ids.append(page["id"])
                    if result.get("has_more"):
                        cursor = result.get("next_cursor")
                    else:
                        break
        except Exception as e:
            logger.error(f"Failed to get pages from database {database_id}: {e}")
        return page_ids

    def fetch_blocks_recursively(
        self,
        block_id: str,
        parent_input_database_id: Optional[str] = None,
        parent_input_page_id: Optional[str] = None,
        source_page_id: Optional[str] = None,
        current_depth: int = 0,
        fetch_child_databases: bool = True,
        global_order_counter: Optional[List[int]] = None,
    ) -> List[dict]:
        """
        Fetch all blocks under a block_id recursively.
        Also fetches content from child_database blocks.

        Args:
            block_id: The block/page ID to fetch children from
            parent_input_database_id: The original input database ID from config
            parent_input_page_id: The original input page ID from config
            source_page_id: The immediate page this block belongs to
            current_depth: Current recursion depth (0 = top level)
            fetch_child_databases: Whether to recurse into child_database blocks (disable for all_* streams)
            global_order_counter: Mutable counter [int] for tracking reading order across all blocks in a page

        Returns:
            Flat list of all blocks with lineage tracking fields
        """
        # Initialize counter on first call
        if global_order_counter is None:
            global_order_counter = [0]
        # Check recursion depth limit
        if current_depth >= self.config.max_recursion_depth:
            logger.warning(
                f"Max recursion depth {self.config.max_recursion_depth} reached for block {block_id}, stopping recursion"
            )
            return []

        all_blocks = []
        cursor = None
        block_order = 0  # Track position within parent

        while True:
            result = self.get_block_children(block_id, cursor)
            if not result:
                break

            for block in result.get("results") or []:
                if not block:
                    continue
                # Add lineage tracking
                block["parent_block_id"] = block_id
                block["parent_input_database_id"] = parent_input_database_id
                block["parent_input_page_id"] = parent_input_page_id
                block["source_page_id"] = source_page_id
                # Add depth and ordering
                block["depth"] = current_depth
                block["block_order"] = block_order
                block["page_order"] = global_order_counter[0]
                block_order += 1
                global_order_counter[0] += 1

                all_blocks.append(block)

                # Handle child_database blocks - fetch their content in parallel
                if (
                    block.get("type") == "child_database"
                    and self.config.fetch_blocks_recursively
                    and fetch_child_databases
                ):
                    child_db_id = block.get("id")
                    logger.info(f"Found inline database {child_db_id} at depth {current_depth}, fetching its content")

                    try:
                        # Get all pages from the inline database
                        inline_page_ids = self.get_pages_from_database(child_db_id)

                        # Fetch blocks from inline pages in parallel
                        # Note: parallel execution means global_order_counter won't be sequential for inline DBs
                        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                            futures = {
                                executor.submit(
                                    self.fetch_blocks_recursively,
                                    block_id=inline_page_id,
                                    parent_input_database_id=parent_input_database_id,
                                    parent_input_page_id=parent_input_page_id,
                                    source_page_id=inline_page_id,
                                    current_depth=current_depth + 1,
                                    fetch_child_databases=fetch_child_databases,
                                    global_order_counter=global_order_counter,
                                ): inline_page_id
                                for inline_page_id in inline_page_ids
                            }
                            for future in as_completed(futures):
                                try:
                                    inline_blocks = future.result()
                                    all_blocks.extend(inline_blocks)
                                except Exception as e:
                                    page_id = futures[future]
                                    logger.error(f"Failed to fetch blocks from inline page {page_id}: {e}")

                    except Exception as e:
                        logger.error(f"Failed to fetch content from inline database {child_db_id}: {e}")

                # Recursively fetch children if block has children
                # Skip child_page and child_database - they are references, not containers with inline content
                elif (
                    block.get("has_children")
                    and self.config.fetch_blocks_recursively
                    and block.get("type") not in ("child_page", "child_database")
                ):
                    try:
                        child_blocks = self.fetch_blocks_recursively(
                            block_id=block["id"],
                            parent_input_database_id=parent_input_database_id,
                            parent_input_page_id=parent_input_page_id,
                            source_page_id=source_page_id,
                            current_depth=current_depth + 1,
                            fetch_child_databases=fetch_child_databases,
                            global_order_counter=global_order_counter,
                        )
                        all_blocks.extend(child_blocks)
                    except Exception as e:
                        # synced_block can return 404 if original block is inaccessible
                        if block.get("type") == "synced_block":
                            logger.warning(f"Skipping synced_block {block.get('id')} - children inaccessible: {e}")
                        else:
                            logger.error(f"Failed to fetch children of block {block.get('id')}: {e}")

            if result.get("has_more"):
                cursor = result.get("next_cursor")
            else:
                break

        return all_blocks

    def get_blocks(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch blocks from databases and pages.
        Blocks are fetched recursively if fetch_blocks_recursively is True.
        Also fetches content from inline databases (child_database blocks).
        """
        if pagination:
            # Each item is: {"block_id": str, "input_db_id": str|None, "input_page_id": str|None, "source_page_id": str|None}
            items_to_process = pagination.get("items_to_process", [])
            items_loaded = pagination.get("items_loaded", False)
        else:
            items_to_process = []
            items_loaded = False

        # First, collect all database IDs and page IDs to fetch blocks from
        if not items_loaded:
            # Add configured page_ids (these are direct input pages)
            for page_id in self.config.page_ids:
                items_to_process.append(
                    {
                        "block_id": page_id,
                        "input_db_id": None,
                        "input_page_id": page_id,
                        "source_page_id": page_id,
                    }
                )

            # Collect pages from databases
            for db_id in self.config.database_ids:
                try:
                    db_filter = self.get_filter_for_database(db_id)
                    db_data = self.get_database(db_id)
                    for ds in db_data.get("data_sources", []):
                        cursor = None
                        while True:
                            result = self.query_data_source(ds["id"], cursor, filter=db_filter)
                            for page in result.get("results", []):
                                items_to_process.append(
                                    {
                                        "block_id": page["id"],
                                        "input_db_id": db_id,
                                        "input_page_id": None,
                                        "source_page_id": page["id"],
                                    }
                                )
                            if result.get("has_more"):
                                cursor = result.get("next_cursor")
                            else:
                                break
                except Exception as e:
                    logger.error(f"Failed to collect pages from database {db_id}: {e}")

            items_loaded = True

        if not items_to_process:
            return SourceIteration(records=[], next_pagination={})

        # Process a batch in parallel
        batch_size = self.config.max_workers
        batch = items_to_process[:batch_size]
        items_to_process = items_to_process[batch_size:]

        records = []

        def fetch_item_blocks(item_info: dict) -> List[dict]:
            """Fetch all blocks for a database or page."""
            return self.fetch_blocks_recursively(
                block_id=item_info["block_id"],
                parent_input_database_id=item_info["input_db_id"],
                parent_input_page_id=item_info["input_page_id"],
                source_page_id=item_info["source_page_id"],
            )

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            futures = {executor.submit(fetch_item_blocks, item_info): item_info for item_info in batch}
            for future in as_completed(futures):
                item_info = futures[future]
                try:
                    blocks = future.result()
                    for block in blocks:
                        records.append(SourceRecord(id=block["id"], data=block))
                    logger.info(f"Fetched {len(blocks)} blocks from {item_info['block_id']}")
                except Exception as e:
                    logger.error(f"Failed to fetch blocks from {item_info['block_id']}: {e}")

        next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== HELPERS ====================

    def get_filter_for_database(self, database_id: str) -> Optional[dict]:
        """Get the filter configured for a database, if any."""
        return self.config.database_filters.get(database_id)

    def _extract_title(self, database_data: dict) -> str:
        """Extract plain text title from database object."""
        title_parts = database_data.get("title", [])
        return "".join(part.get("plain_text", "") for part in title_parts)

    # ==================== MARKDOWN CONVERSION ====================

    def _extract_rich_text(self, rich_text_array: List[dict]) -> str:
        """Convert Notion rich text array to markdown string with formatting."""
        if not rich_text_array:
            return ""

        result = []
        for item in rich_text_array:
            text = item.get("plain_text", "")
            annotations = item.get("annotations", {})
            href = item.get("href")

            # Apply formatting in order: code, bold, italic, strikethrough
            if annotations.get("code"):
                text = f"`{text}`"
            if annotations.get("bold"):
                text = f"**{text}**"
            if annotations.get("italic"):
                text = f"*{text}*"
            if annotations.get("strikethrough"):
                text = f"~~{text}~~"
            if href:
                text = f"[{text}]({href})"

            result.append(text)

        return "".join(result)

    def _block_to_markdown(self, block: dict) -> str:
        """Convert a single Notion block to markdown string."""
        block_type = block.get("type", "")
        content = block.get(block_type) or {}

        # Text blocks
        if block_type == "paragraph":
            return self._extract_rich_text(content.get("rich_text", []))

        elif block_type == "heading_1":
            return f"# {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "heading_2":
            return f"## {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "heading_3":
            return f"### {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "bulleted_list_item":
            return f"- {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "numbered_list_item":
            return f"1. {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "to_do":
            checkbox = "[x]" if content.get("checked") else "[ ]"
            return f"- {checkbox} {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "quote":
            return f"> {self._extract_rich_text(content.get('rich_text', []))}"

        elif block_type == "callout":
            icon = content.get("icon") or {}
            emoji = icon.get("emoji", "💡")
            text = self._extract_rich_text(content.get("rich_text", []))
            return f"> {emoji} {text}"

        elif block_type == "code":
            language = content.get("language", "")
            code_text = self._extract_rich_text(content.get("rich_text", []))
            return f"```{language}\n{code_text}\n```"

        elif block_type == "equation":
            return f"$$ {content.get('expression', '')} $$"

        elif block_type == "divider":
            return "---"

        elif block_type == "toggle":
            return f"<details><summary>{self._extract_rich_text(content.get('rich_text', []))}</summary></details>"
        # Media blocks
        elif block_type == "image":
            url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
            caption = self._extract_rich_text(content.get("caption", []))
            return f"![{caption}]({url})"
        elif block_type == "video":
            url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
            return f"[Video]({url})"

        elif block_type == "file":
            url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
            caption = self._extract_rich_text(content.get("caption", [])) or "File"
            return f"[{caption}]({url})"

        elif block_type == "pdf":
            url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
            return f"[PDF]({url})"

        elif block_type == "bookmark":
            url = content.get("url", "")
            caption = self._extract_rich_text(content.get("caption", [])) or url
            return f"[{caption}]({url})"

        elif block_type == "embed":
            return f"[Embed]({content.get('url', '')})"

        elif block_type == "link_preview":
            return f"[Link Preview]({content.get('url', '')})"

        # Table blocks
        elif block_type == "table":
            return "[Table - see child blocks for rows]"

        elif block_type == "table_row":
            cells = content.get("cells", [])
            row = " | ".join(self._extract_rich_text(cell) for cell in cells)
            return f"| {row} |"

        # Database/page references
        elif block_type == "child_page":
            return f"[Page: {content.get('title', 'Untitled')}]"

        elif block_type == "child_database":
            return f"[Database: {content.get('title', 'Untitled')}]"

        elif block_type == "link_to_page":
            page_id = content.get("page_id") or content.get("database_id", "")
            return f"[Link to page: {page_id}]"

        elif block_type == "table_of_contents":
            return "[Table of Contents]"

        elif block_type == "breadcrumb":
            return "[Breadcrumb]"

        elif block_type == "synced_block":
            return "[Synced Block]"

        elif block_type == "template":
            return "[Template]"

        elif block_type == "column_list":
            return ""  # Column list is just a container

        elif block_type == "column":
            return ""  # Column is just a container

        else:
            return f"[Unsupported block type: {block_type}]"

    def get_blocks_markdown(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch blocks and convert them to markdown.
        Returns one record per block with its markdown content.
        """
        if pagination:
            items_to_process = pagination.get("items_to_process", [])
            items_loaded = pagination.get("items_loaded", False)
        else:
            items_to_process = []
            items_loaded = False

        # Collect all database IDs and page IDs to fetch blocks from
        if not items_loaded:
            for page_id in self.config.page_ids:
                items_to_process.append(
                    {
                        "block_id": page_id,
                        "input_db_id": None,
                        "input_page_id": page_id,
                        "source_page_id": page_id,
                    }
                )

            for db_id in self.config.database_ids:
                # Collect pages from database's data_sources
                try:
                    db_filter = self.get_filter_for_database(db_id)
                    db_data = self.get_database(db_id)
                    for ds in db_data.get("data_sources", []):
                        cursor = None
                        while True:
                            result = self.query_data_source(ds["id"], cursor, filter=db_filter)
                            for page in result.get("results", []):
                                items_to_process.append(
                                    {
                                        "block_id": page["id"],
                                        "input_db_id": db_id,
                                        "input_page_id": None,
                                        "source_page_id": page["id"],
                                    }
                                )
                            if result.get("has_more"):
                                cursor = result.get("next_cursor")
                            else:
                                break
                except Exception as e:
                    logger.error(f"Failed to collect pages from database {db_id}: {e}")

            items_loaded = True

        if not items_to_process:
            return SourceIteration(records=[], next_pagination={})

        # Process a batch in parallel
        batch_size = self.config.max_workers
        batch = items_to_process[:batch_size]
        items_to_process = items_to_process[batch_size:]

        records = []

        def fetch_and_convert_item(item_info: dict) -> List[dict]:
            """Fetch blocks for a database or page and convert each to markdown."""
            blocks = self.fetch_blocks_recursively(
                block_id=item_info["block_id"],
                parent_input_database_id=item_info["input_db_id"],
                parent_input_page_id=item_info["input_page_id"],
                source_page_id=item_info["source_page_id"],
                fetch_child_databases=False,
            )

            # Convert each block to markdown record
            block_records = []
            for block in blocks or []:
                if not block:
                    continue
                md = self._block_to_markdown(block)
                block_records.append(
                    {
                        "block_id": block.get("id"),
                        "block_type": block.get("type"),
                        "markdown": md,
                        "source_page_id": block.get("source_page_id"),
                        "parent_block_id": block.get("parent_block_id"),
                        "parent_input_database_id": block.get("parent_input_database_id"),
                        "parent_input_page_id": block.get("parent_input_page_id"),
                        "depth": block.get("depth"),
                        "block_order": block.get("block_order"),
                        "page_order": block.get("page_order"),
                        "block_raw": block,
                    }
                )
            return block_records

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
            for future in as_completed(futures):
                item_info = futures[future]
                try:
                    block_records = future.result()
                    for block_record in block_records:
                        records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
                    logger.info(f"Converted {len(block_records)} blocks to markdown from {item_info['block_id']}")
                except Exception as e:
                    import traceback

                    logger.error(
                        f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}\n{traceback.format_exc()}"
                    )

        next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== SEARCH API (ALL_* STREAMS) ====================

    def search(self, start_cursor: str = None) -> dict:
        """
        Search all pages and databases accessible to the integration.

        Args:
            start_cursor: Pagination cursor

        Returns:
            Raw search results (filter client-side by object type)
        """
        payload = {"page_size": self.config.page_size}
        if start_cursor:
            payload["start_cursor"] = start_cursor

        response = self.session.post(f"{BASE_URL}/search", json=payload)
        response.raise_for_status()
        return response.json()

    def search_by_type(self, object_type: str, start_cursor: str = None) -> dict:
        """
        Search and filter by object type client-side.

        Args:
            object_type: "page" or "database"
            start_cursor: Pagination cursor

        Returns:
            Filtered results matching object_type
        """
        result = self.search(start_cursor=start_cursor)

        # Filter results by object type
        filtered_results = [item for item in result.get("results", []) if item.get("object") == object_type]

        return {
            "results": filtered_results,
            "has_more": result.get("has_more", False),
            "next_cursor": result.get("next_cursor"),
        }

    def get_all_databases(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch all databases accessible to the integration.
        In 2025-09-03 API, we get data_sources from search and fetch their parent databases.
        """
        if pagination:
            db_ids_to_fetch = pagination.get("db_ids_to_fetch", [])
            dbs_loaded = pagination.get("dbs_loaded", False)
        else:
            db_ids_to_fetch = []
            dbs_loaded = False

        # Collect unique database IDs from data_sources
        if not dbs_loaded:
            seen_db_ids = set()
            search_cursor = None

            while True:
                result = self.search_by_type(object_type="data_source", start_cursor=search_cursor)
                for ds in result.get("results", []):
                    # Data sources have a parent.database_id
                    parent = ds.get("parent", {})
                    if parent.get("type") == "database_id":
                        db_id = parent.get("database_id")
                        if db_id and db_id not in seen_db_ids:
                            seen_db_ids.add(db_id)
                            db_ids_to_fetch.append(db_id)

                if result.get("has_more"):
                    search_cursor = result.get("next_cursor")
                else:
                    break

            dbs_loaded = True
            logger.info(f"Found {len(db_ids_to_fetch)} unique databases from data_sources")

        if not db_ids_to_fetch:
            return SourceIteration(records=[], next_pagination={})

        # Fetch one database at a time
        db_id = db_ids_to_fetch[0]
        db_ids_to_fetch = db_ids_to_fetch[1:]

        records = []
        try:
            db_data = self.get_database(db_id)
            records.append(SourceRecord(id=db_data["id"], data=db_data))
        except Exception as e:
            logger.error(f"Failed to fetch database {db_id}: {e}")

        next_pagination = {"db_ids_to_fetch": db_ids_to_fetch, "dbs_loaded": True} if db_ids_to_fetch else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    def get_all_data_sources(self, pagination: dict = None) -> SourceIteration:
        """Fetch all data_sources accessible to the integration."""
        cursor = pagination.get("start_cursor") if pagination else None

        result = self.search_by_type(object_type="data_source", start_cursor=cursor)

        records = [SourceRecord(id=ds["id"], data=ds) for ds in result.get("results", [])]

        next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    def get_all_pages(self, pagination: dict = None) -> SourceIteration:
        """Fetch all pages accessible to the integration."""
        cursor = pagination.get("start_cursor") if pagination else None

        result = self.search_by_type(object_type="page", start_cursor=cursor)

        records = [SourceRecord(id=page["id"], data=page) for page in result.get("results", [])]

        next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    def get_all_blocks_markdown(self, pagination: dict = None) -> SourceIteration:
        """
        Fetch all databases and pages accessible to the integration and convert their blocks to markdown.
        Includes databases and pages from search API AND pages from all databases via data_sources.
        """
        if pagination:
            items_to_process = pagination.get("items_to_process", [])
            items_loaded = pagination.get("items_loaded", False)
        else:
            items_to_process = []
            items_loaded = False

        # Collect databases and pages from search API
        if not items_loaded:
            seen_ids = set()

            # 1. Get pages from search API
            search_cursor = None
            while True:
                result = self.search_by_type(object_type="page", start_cursor=search_cursor)
                for page in result.get("results", []):
                    if page["id"] not in seen_ids:
                        seen_ids.add(page["id"])
                        items_to_process.append(
                            {
                                "block_id": page["id"],
                                "input_db_id": None,
                                "input_page_id": None,
                                "source_page_id": page["id"],
                            }
                        )

                if result.get("has_more"):
                    search_cursor = result.get("next_cursor")
                else:
                    break

            logger.info(f"Found {len(items_to_process)} pages from search API")

            # 2. Get all data_sources and their parent databases + query for pages
            ds_search_cursor = None
            while True:
                result = self.search_by_type(object_type="data_source", start_cursor=ds_search_cursor)
                for ds in result.get("results", []):
                    ds_id = ds["id"]
                    # Get parent database_id from data_source
                    parent = ds.get("parent", {})
                    parent_db_id = parent.get("database_id") if parent.get("type") == "database_id" else None

                    # Add the parent database to fetch its blocks (headers, descriptions, etc.)
                    if parent_db_id and parent_db_id not in seen_ids:
                        seen_ids.add(parent_db_id)
                        items_to_process.append(
                            {
                                "block_id": parent_db_id,
                                "input_db_id": parent_db_id,
                                "input_page_id": None,
                                "source_page_id": None,
                            }
                        )

                    try:
                        # Query data_source for pages (no filter for all_* streams)
                        ds_cursor = None
                        while True:
                            ds_result = self.query_data_source(ds_id, ds_cursor)
                            for page in ds_result.get("results", []):
                                if page["id"] not in seen_ids:
                                    seen_ids.add(page["id"])
                                    items_to_process.append(
                                        {
                                            "block_id": page["id"],
                                            "input_db_id": parent_db_id,
                                            "input_page_id": None,
                                            "source_page_id": page["id"],
                                        }
                                    )
                            if ds_result.get("has_more"):
                                ds_cursor = ds_result.get("next_cursor")
                            else:
                                break
                    except Exception as e:
                        logger.error(f"Failed to get pages from data_source {ds_id}: {e}")

                if result.get("has_more"):
                    ds_search_cursor = result.get("next_cursor")
                else:
                    break

            items_loaded = True
            logger.info(
                f"Total {len(items_to_process)} unique items (databases + pages) to process for all_blocks_markdown"
            )

        if not items_to_process:
            return SourceIteration(records=[], next_pagination={})

        # Process a batch in parallel
        batch_size = self.config.max_workers
        batch = items_to_process[:batch_size]
        items_to_process = items_to_process[batch_size:]

        records = []

        def fetch_and_convert_item(item_info: dict) -> List[dict]:
            """Fetch blocks for a database or page and convert each to markdown."""
            # fetch_child_databases=False because all_blocks_markdown already collects
            # all pages from all data_sources, so we don't need to recurse into child_database blocks
            blocks = self.fetch_blocks_recursively(
                block_id=item_info["block_id"],
                parent_input_database_id=item_info["input_db_id"],
                parent_input_page_id=item_info["input_page_id"],
                source_page_id=item_info["source_page_id"],
                fetch_child_databases=False,
            )

            # Convert each block to markdown record
            block_records = []
            for block in blocks or []:
                if not block:
                    continue
                md = self._block_to_markdown(block)
                block_records.append(
                    {
                        "block_id": block.get("id"),
                        "block_type": block.get("type"),
                        "markdown": md,
                        "source_page_id": block.get("source_page_id"),
                        "parent_block_id": block.get("parent_block_id"),
                        "parent_input_database_id": block.get("parent_input_database_id"),
                        "parent_input_page_id": block.get("parent_input_page_id"),
                        "depth": block.get("depth"),
                        "block_order": block.get("block_order"),
                        "page_order": block.get("page_order"),
                        "block_raw": block,
                    }
                )
            return block_records

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
            for future in as_completed(futures):
                item_info = futures[future]
                try:
                    block_records = future.result()
                    for block_record in block_records:
                        records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
                    logger.info(f"Converted {len(block_records)} blocks to markdown from {item_info['block_id']}")
                except Exception as e:
                    logger.error(f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}")

        next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}

        return SourceIteration(records=records, next_pagination=next_pagination)

    # ==================== MAIN DISPATCH ====================

    def get(self, pagination: dict = None) -> SourceIteration:
        if self.config.stream == NotionStreams.USERS:
            return self.get_users(pagination)
        elif self.config.stream == NotionStreams.DATABASES:
            return self.get_databases(pagination)
        elif self.config.stream == NotionStreams.DATA_SOURCES:
            return self.get_data_sources(pagination)
        elif self.config.stream == NotionStreams.PAGES:
            return self.get_pages(pagination)
        elif self.config.stream == NotionStreams.BLOCKS:
            return self.get_blocks(pagination)
        elif self.config.stream == NotionStreams.BLOCKS_MARKDOWN:
            return self.get_blocks_markdown(pagination)
        elif self.config.stream == NotionStreams.ALL_PAGES:
            return self.get_all_pages(pagination)
        elif self.config.stream == NotionStreams.ALL_DATABASES:
            return self.get_all_databases(pagination)
        elif self.config.stream == NotionStreams.ALL_DATA_SOURCES:
            return self.get_all_data_sources(pagination)
        elif self.config.stream == NotionStreams.ALL_BLOCKS_MARKDOWN:
            return self.get_all_blocks_markdown(pagination)

        raise NotImplementedError(f"Stream {self.config.stream} not implemented for Notion")