bizon 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +182 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +0 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +11 -8
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
  7. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  8. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
  9. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  10. bizon/connectors/destinations/file/src/config.py +1 -1
  11. bizon/connectors/destinations/file/src/destination.py +0 -5
  12. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  13. bizon/connectors/destinations/logger/src/config.py +0 -2
  14. bizon/connectors/destinations/logger/src/destination.py +1 -2
  15. bizon/connectors/sources/cycle/src/source.py +2 -6
  16. bizon/connectors/sources/dummy/src/source.py +0 -4
  17. bizon/connectors/sources/gsheets/src/source.py +2 -3
  18. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  19. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  20. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  21. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  22. bizon/connectors/sources/kafka/src/config.py +10 -6
  23. bizon/connectors/sources/kafka/src/decode.py +2 -2
  24. bizon/connectors/sources/kafka/src/source.py +147 -46
  25. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  26. bizon/connectors/sources/notion/src/__init__.py +0 -0
  27. bizon/connectors/sources/notion/src/config.py +59 -0
  28. bizon/connectors/sources/notion/src/source.py +1159 -0
  29. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  30. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  31. bizon/connectors/sources/periscope/src/source.py +0 -6
  32. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  33. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  34. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  35. bizon/destination/buffer.py +0 -1
  36. bizon/destination/config.py +0 -1
  37. bizon/destination/destination.py +1 -4
  38. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  39. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  40. bizon/engine/config.py +0 -1
  41. bizon/engine/engine.py +0 -1
  42. bizon/engine/pipeline/consumer.py +0 -1
  43. bizon/engine/pipeline/producer.py +1 -5
  44. bizon/engine/queue/adapters/kafka/config.py +1 -1
  45. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  46. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  47. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  48. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  49. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  50. bizon/engine/queue/config.py +0 -2
  51. bizon/engine/runner/adapters/process.py +0 -2
  52. bizon/engine/runner/adapters/streaming.py +55 -1
  53. bizon/engine/runner/adapters/thread.py +0 -2
  54. bizon/engine/runner/config.py +0 -1
  55. bizon/engine/runner/runner.py +0 -2
  56. bizon/monitoring/datadog/monitor.py +5 -3
  57. bizon/monitoring/noop/monitor.py +1 -1
  58. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  59. bizon/source/auth/authenticators/abstract_token.py +2 -1
  60. bizon/source/auth/authenticators/basic.py +1 -1
  61. bizon/source/auth/authenticators/cookies.py +2 -1
  62. bizon/source/auth/authenticators/oauth.py +8 -3
  63. bizon/source/config.py +0 -2
  64. bizon/source/cursor.py +8 -16
  65. bizon/source/discover.py +3 -6
  66. bizon/source/models.py +0 -1
  67. bizon/source/session.py +0 -1
  68. bizon/source/source.py +17 -2
  69. bizon/transform/config.py +0 -2
  70. bizon/transform/transform.py +0 -3
  71. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
  72. bizon-0.2.0.dist-info/RECORD +136 -0
  73. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  74. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  75. bizon-0.1.2.dist-info/RECORD +0 -123
  76. bizon-0.1.2.dist-info/entry_points.txt +0 -3
  77. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/notion/src/source.py (added)
@@ -0,0 +1,1159 @@
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import Any, List, Optional, Tuple
+
+ from loguru import logger
+ from requests import Session
+ from requests.adapters import HTTPAdapter
+ from requests.auth import AuthBase
+ from urllib3.util.retry import Retry
+
+ from bizon.source.auth.builder import AuthBuilder
+ from bizon.source.auth.config import AuthType
+ from bizon.source.config import SourceConfig
+ from bizon.source.models import SourceIteration, SourceRecord
+ from bizon.source.source import AbstractSource
+
+ from .config import NotionSourceConfig, NotionStreams
+
+ NOTION_API_VERSION = "2025-09-03"
+ BASE_URL = "https://api.notion.com/v1"
+
+
+ class NotionSource(AbstractSource):
+     def __init__(self, config: NotionSourceConfig):
+         super().__init__(config)
+         self.config: NotionSourceConfig = config
+
+     def get_session(self) -> Session:
+         """Create a session with retry logic and required Notion headers."""
+         session = Session()
+         retries = Retry(
+             total=10,
+             backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32... seconds
+             status_forcelist=[429, 500, 502, 503, 504],
+             allowed_methods=["GET", "POST", "PATCH"],  # Retry on POST/PATCH too
+             respect_retry_after_header=True,  # Honor Notion's Retry-After header
+         )
+         session.mount("https://", HTTPAdapter(max_retries=retries))
+         session.headers.update(
+             {
+                 "Notion-Version": NOTION_API_VERSION,
+                 "Content-Type": "application/json",
+             }
+         )
+         return session
+
+     def get_authenticator(self) -> AuthBase:
+         if self.config.authentication.type.value in [AuthType.API_KEY, AuthType.BEARER]:
+             return AuthBuilder.token(params=self.config.authentication.params)
+         return None
+
+     @staticmethod
+     def streams() -> List[str]:
+         return [item.value for item in NotionStreams]
+
+     @staticmethod
+     def get_config_class() -> SourceConfig:
+         return NotionSourceConfig
+
+     def check_connection(self) -> Tuple[bool, Optional[Any]]:
+         """Test connection by listing users."""
+         try:
+             response = self.session.get(f"{BASE_URL}/users")
+             response.raise_for_status()
+             return True, None
+         except Exception as e:
+             return False, str(e)
+
+     def get_total_records_count(self) -> Optional[int]:
+         return None
+
+     # ==================== USERS STREAM ====================
+
+     def get_users(self, pagination: dict = None) -> SourceIteration:
+         """Fetch all users accessible to the integration."""
+         params = {"page_size": self.config.page_size}
+         if pagination and pagination.get("start_cursor"):
+             params["start_cursor"] = pagination["start_cursor"]
+
+         response = self.session.get(f"{BASE_URL}/users", params=params)
+         response.raise_for_status()
+         data = response.json()
+
+         records = [SourceRecord(id=user["id"], data=user) for user in data.get("results", [])]
+
+         next_pagination = {"start_cursor": data.get("next_cursor")} if data.get("has_more") else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== DATABASES STREAM ====================
+
+     def get_database(self, database_id: str) -> dict:
+         """Fetch a single database by ID."""
+         response = self.session.get(f"{BASE_URL}/databases/{database_id}")
+         response.raise_for_status()
+         return response.json()
+
+     def get_databases(self, pagination: dict = None) -> SourceIteration:
+         """Fetch databases for the configured database_ids."""
+         if not self.config.database_ids:
+             logger.warning("No database_ids configured, returning empty results")
+             return SourceIteration(records=[], next_pagination={})
+
+         # Track progress through database_ids list
+         if pagination:
+             remaining_ids = pagination.get("remaining_ids", [])
+         else:
+             remaining_ids = list(self.config.database_ids)
+
+         if not remaining_ids:
+             return SourceIteration(records=[], next_pagination={})
+
+         # Process one database at a time
+         database_id = remaining_ids[0]
+         remaining_ids = remaining_ids[1:]
+
+         try:
+             database_data = self.get_database(database_id)
+             records = [SourceRecord(id=database_data["id"], data=database_data)]
+         except Exception as e:
+             logger.error(f"Failed to fetch database {database_id}: {e}")
+             records = []
+
+         next_pagination = {"remaining_ids": remaining_ids} if remaining_ids else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== DATA SOURCES STREAM ====================
+
+     def get_data_sources(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch data sources from databases.
+         In the 2025-09-03 API, databases contain a data_sources array.
+         """
+         if not self.config.database_ids:
+             logger.warning("No database_ids configured, returning empty results")
+             return SourceIteration(records=[], next_pagination={})
+
+         if pagination:
+             remaining_ids = pagination.get("remaining_ids", [])
+         else:
+             remaining_ids = list(self.config.database_ids)
+
+         if not remaining_ids:
+             return SourceIteration(records=[], next_pagination={})
+
+         database_id = remaining_ids[0]
+         remaining_ids = remaining_ids[1:]
+
+         records = []
+         try:
+             database_data = self.get_database(database_id)
+             data_sources = database_data.get("data_sources", [])
+
+             for ds in data_sources:
+                 # Enrich data source with parent database info
+                 ds_record = {
+                     **ds,
+                     "parent_database_id": database_id,
+                     "parent_database_title": self._extract_title(database_data),
+                 }
+                 records.append(SourceRecord(id=ds["id"], data=ds_record))
+
+         except Exception as e:
+             logger.error(f"Failed to fetch data sources from database {database_id}: {e}")
+
+         next_pagination = {"remaining_ids": remaining_ids} if remaining_ids else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== PAGES STREAM ====================
+
+     def query_data_source(self, data_source_id: str, start_cursor: str = None, filter: dict = None) -> dict:
+         """Query a data source for its pages."""
+         payload = {"page_size": self.config.page_size}
+         if start_cursor:
+             payload["start_cursor"] = start_cursor
+         if filter:
+             payload["filter"] = filter
+
+         response = self.session.post(f"{BASE_URL}/data_sources/{data_source_id}/query", json=payload)
+         response.raise_for_status()
+         return response.json()
+
+     def get_page(self, page_id: str) -> dict:
+         """Fetch a single page by ID."""
+         response = self.session.get(f"{BASE_URL}/pages/{page_id}")
+         response.raise_for_status()
+         return response.json()
+
+     def get_pages(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch pages from data sources (querying databases) and/or specific page IDs.
+         """
+         records = []
+
+         if pagination:
+             # Continue previous pagination state
+             # remaining_data_sources is list of {"ds_id": str, "db_id": str}
+             remaining_data_sources = pagination.get("remaining_data_sources", [])
+             current_data_source = pagination.get("current_data_source")  # {"ds_id": str, "db_id": str}
+             data_source_cursor = pagination.get("data_source_cursor")
+             remaining_page_ids = pagination.get("remaining_page_ids", [])
+             data_sources_loaded = pagination.get("data_sources_loaded", False)
+         else:
+             remaining_data_sources = []
+             current_data_source = None
+             data_source_cursor = None
+             remaining_page_ids = list(self.config.page_ids)
+             data_sources_loaded = False
+
+         # First, load all data sources from databases if not done
+         if not data_sources_loaded and self.config.database_ids:
+             for db_id in self.config.database_ids:
+                 try:
+                     db_data = self.get_database(db_id)
+                     for ds in db_data.get("data_sources", []):
+                         remaining_data_sources.append({"ds_id": ds["id"], "db_id": db_id})
+                 except Exception as e:
+                     logger.error(f"Failed to get data sources from database {db_id}: {e}")
+             data_sources_loaded = True
+
+         # Process current data source if we have one with a cursor
+         if current_data_source and data_source_cursor:
+             try:
+                 ds_filter = self.get_filter_for_database(current_data_source["db_id"])
+                 result = self.query_data_source(current_data_source["ds_id"], data_source_cursor, filter=ds_filter)
+                 for page in result.get("results", []):
+                     records.append(SourceRecord(id=page["id"], data=page))
+
+                 if result.get("has_more"):
+                     return SourceIteration(
+                         records=records,
+                         next_pagination={
+                             "remaining_data_sources": remaining_data_sources,
+                             "current_data_source": current_data_source,
+                             "data_source_cursor": result.get("next_cursor"),
+                             "remaining_page_ids": remaining_page_ids,
+                             "data_sources_loaded": True,
+                         },
+                     )
+             except Exception as e:
+                 logger.error(f"Failed to query data source {current_data_source['ds_id']}: {e}")
+
+         # Process next data source
+         if remaining_data_sources:
+             ds_info = remaining_data_sources[0]
+             remaining_data_sources = remaining_data_sources[1:]
+
+             try:
+                 ds_filter = self.get_filter_for_database(ds_info["db_id"])
+                 result = self.query_data_source(ds_info["ds_id"], filter=ds_filter)
+                 for page in result.get("results", []):
+                     records.append(SourceRecord(id=page["id"], data=page))
+
+                 if result.get("has_more"):
+                     return SourceIteration(
+                         records=records,
+                         next_pagination={
+                             "remaining_data_sources": remaining_data_sources,
+                             "current_data_source": ds_info,
+                             "data_source_cursor": result.get("next_cursor"),
+                             "remaining_page_ids": remaining_page_ids,
+                             "data_sources_loaded": True,
+                         },
+                     )
+
+                 # If there are more data sources, continue
+                 if remaining_data_sources:
+                     return SourceIteration(
+                         records=records,
+                         next_pagination={
+                             "remaining_data_sources": remaining_data_sources,
+                             "current_data_source": None,
+                             "data_source_cursor": None,
+                             "remaining_page_ids": remaining_page_ids,
+                             "data_sources_loaded": True,
+                         },
+                     )
+             except Exception as e:
+                 logger.error(f"Failed to query data source {ds_info['ds_id']}: {e}")
+                 # Continue with remaining data sources
+                 if remaining_data_sources:
+                     return SourceIteration(
+                         records=records,
+                         next_pagination={
+                             "remaining_data_sources": remaining_data_sources,
+                             "current_data_source": None,
+                             "data_source_cursor": None,
+                             "remaining_page_ids": remaining_page_ids,
+                             "data_sources_loaded": True,
+                         },
+                     )
+
+         # Process individual page IDs
+         if remaining_page_ids:
+             page_id = remaining_page_ids[0]
+             remaining_page_ids = remaining_page_ids[1:]
+
+             try:
+                 page_data = self.get_page(page_id)
+                 records.append(SourceRecord(id=page_data["id"], data=page_data))
+             except Exception as e:
+                 logger.error(f"Failed to fetch page {page_id}: {e}")
+
+             if remaining_page_ids:
+                 return SourceIteration(
+                     records=records,
+                     next_pagination={
+                         "remaining_data_sources": [],
+                         "current_data_source": None,
+                         "data_source_cursor": None,
+                         "remaining_page_ids": remaining_page_ids,
+                         "data_sources_loaded": True,
+                     },
+                 )
+
+         return SourceIteration(records=records, next_pagination={})
+
+     # ==================== BLOCKS STREAM ====================
+
+     def get_block_children(self, block_id: str, start_cursor: str = None) -> dict:
+         """Fetch children blocks of a block."""
+         params = {"page_size": self.config.page_size}
+         if start_cursor:
+             params["start_cursor"] = start_cursor
+
+         response = self.session.get(f"{BASE_URL}/blocks/{block_id}/children", params=params)
+         response.raise_for_status()
+         return response.json()
+
+     def get_pages_from_database(self, database_id: str, apply_filter: bool = False) -> List[str]:
+         """Get all page IDs from a database by querying its data sources.
+
+         Args:
+             database_id: The database ID to fetch pages from
+             apply_filter: Whether to apply database_filters config (False for inline databases)
+         """
+         page_ids = []
+         db_filter = self.get_filter_for_database(database_id) if apply_filter else None
+         try:
+             db_data = self.get_database(database_id)
+             if not db_data:
+                 return page_ids
+             for ds in db_data.get("data_sources") or []:
+                 cursor = None
+                 while True:
+                     result = self.query_data_source(ds["id"], cursor, filter=db_filter)
+                     if not result:
+                         break
+                     for page in result.get("results") or []:
+                         if page and page.get("id"):
+                             page_ids.append(page["id"])
+                     if result.get("has_more"):
+                         cursor = result.get("next_cursor")
+                     else:
+                         break
+         except Exception as e:
+             logger.error(f"Failed to get pages from database {database_id}: {e}")
+         return page_ids
+
+     def fetch_blocks_recursively(
+         self,
+         block_id: str,
+         parent_input_database_id: Optional[str] = None,
+         parent_input_page_id: Optional[str] = None,
+         source_page_id: Optional[str] = None,
+         current_depth: int = 0,
+         fetch_child_databases: bool = True,
+         global_order_counter: Optional[List[int]] = None,
+     ) -> List[dict]:
+         """
+         Fetch all blocks under a block_id recursively.
+         Also fetches content from child_database blocks.
+
+         Args:
+             block_id: The block/page ID to fetch children from
+             parent_input_database_id: The original input database ID from config
+             parent_input_page_id: The original input page ID from config
+             source_page_id: The immediate page this block belongs to
+             current_depth: Current recursion depth (0 = top level)
+             fetch_child_databases: Whether to recurse into child_database blocks (disable for all_* streams)
+             global_order_counter: Mutable counter [int] for tracking reading order across all blocks in a page
+
+         Returns:
+             Flat list of all blocks with lineage tracking fields
+         """
+         # Initialize counter on first call
+         if global_order_counter is None:
+             global_order_counter = [0]
+         # Check recursion depth limit
+         if current_depth >= self.config.max_recursion_depth:
+             logger.warning(
+                 f"Max recursion depth {self.config.max_recursion_depth} reached for block {block_id}, stopping recursion"
+             )
+             return []
+
+         all_blocks = []
+         cursor = None
+         block_order = 0  # Track position within parent
+
+         while True:
+             result = self.get_block_children(block_id, cursor)
+             if not result:
+                 break
+
+             for block in result.get("results") or []:
+                 if not block:
+                     continue
+                 # Add lineage tracking
+                 block["parent_block_id"] = block_id
+                 block["parent_input_database_id"] = parent_input_database_id
+                 block["parent_input_page_id"] = parent_input_page_id
+                 block["source_page_id"] = source_page_id
+                 # Add depth and ordering
+                 block["depth"] = current_depth
+                 block["block_order"] = block_order
+                 block["page_order"] = global_order_counter[0]
+                 block_order += 1
+                 global_order_counter[0] += 1
+
+                 all_blocks.append(block)
+
+                 # Handle child_database blocks - fetch their content in parallel
+                 if (
+                     block.get("type") == "child_database"
+                     and self.config.fetch_blocks_recursively
+                     and fetch_child_databases
+                 ):
+                     child_db_id = block.get("id")
+                     logger.info(f"Found inline database {child_db_id} at depth {current_depth}, fetching its content")
+
+                     try:
+                         # Get all pages from the inline database
+                         inline_page_ids = self.get_pages_from_database(child_db_id)
+
+                         # Fetch blocks from inline pages in parallel
+                         # Note: parallel execution means global_order_counter won't be sequential for inline DBs
+                         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+                             futures = {
+                                 executor.submit(
+                                     self.fetch_blocks_recursively,
+                                     block_id=inline_page_id,
+                                     parent_input_database_id=parent_input_database_id,
+                                     parent_input_page_id=parent_input_page_id,
+                                     source_page_id=inline_page_id,
+                                     current_depth=current_depth + 1,
+                                     fetch_child_databases=fetch_child_databases,
+                                     global_order_counter=global_order_counter,
+                                 ): inline_page_id
+                                 for inline_page_id in inline_page_ids
+                             }
+                             for future in as_completed(futures):
+                                 try:
+                                     inline_blocks = future.result()
+                                     all_blocks.extend(inline_blocks)
+                                 except Exception as e:
+                                     page_id = futures[future]
+                                     logger.error(f"Failed to fetch blocks from inline page {page_id}: {e}")
+
+                     except Exception as e:
+                         logger.error(f"Failed to fetch content from inline database {child_db_id}: {e}")
+
+                 # Recursively fetch children if block has children
+                 # Skip child_page and child_database - they are references, not containers with inline content
+                 elif (
+                     block.get("has_children")
+                     and self.config.fetch_blocks_recursively
+                     and block.get("type") not in ("child_page", "child_database")
+                 ):
+                     try:
+                         child_blocks = self.fetch_blocks_recursively(
+                             block_id=block["id"],
+                             parent_input_database_id=parent_input_database_id,
+                             parent_input_page_id=parent_input_page_id,
+                             source_page_id=source_page_id,
+                             current_depth=current_depth + 1,
+                             fetch_child_databases=fetch_child_databases,
+                             global_order_counter=global_order_counter,
+                         )
+                         all_blocks.extend(child_blocks)
+                     except Exception as e:
+                         # synced_block can return 404 if original block is inaccessible
+                         if block.get("type") == "synced_block":
+                             logger.warning(f"Skipping synced_block {block.get('id')} - children inaccessible: {e}")
+                         else:
+                             logger.error(f"Failed to fetch children of block {block.get('id')}: {e}")
+
+             if result.get("has_more"):
+                 cursor = result.get("next_cursor")
+             else:
+                 break
+
+         return all_blocks
+
+     def get_blocks(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch blocks from databases and pages.
+         Blocks are fetched recursively if fetch_blocks_recursively is True.
+         Also fetches content from inline databases (child_database blocks).
+         """
+         if pagination:
+             # Each item is: {"block_id": str, "input_db_id": str|None, "input_page_id": str|None, "source_page_id": str|None}
+             items_to_process = pagination.get("items_to_process", [])
+             items_loaded = pagination.get("items_loaded", False)
+         else:
+             items_to_process = []
+             items_loaded = False
+
+         # First, collect all database IDs and page IDs to fetch blocks from
+         if not items_loaded:
+             # Add configured page_ids (these are direct input pages)
+             for page_id in self.config.page_ids:
+                 items_to_process.append(
+                     {
+                         "block_id": page_id,
+                         "input_db_id": None,
+                         "input_page_id": page_id,
+                         "source_page_id": page_id,
+                     }
+                 )
+
+             # Collect pages from databases
+             for db_id in self.config.database_ids:
+                 try:
+                     db_filter = self.get_filter_for_database(db_id)
+                     db_data = self.get_database(db_id)
+                     for ds in db_data.get("data_sources", []):
+                         cursor = None
+                         while True:
+                             result = self.query_data_source(ds["id"], cursor, filter=db_filter)
+                             for page in result.get("results", []):
+                                 items_to_process.append(
+                                     {
+                                         "block_id": page["id"],
+                                         "input_db_id": db_id,
+                                         "input_page_id": None,
+                                         "source_page_id": page["id"],
+                                     }
+                                 )
+                             if result.get("has_more"):
+                                 cursor = result.get("next_cursor")
+                             else:
+                                 break
+                 except Exception as e:
+                     logger.error(f"Failed to collect pages from database {db_id}: {e}")
+
+             items_loaded = True
+
+         if not items_to_process:
+             return SourceIteration(records=[], next_pagination={})
+
+         # Process a batch in parallel
+         batch_size = self.config.max_workers
+         batch = items_to_process[:batch_size]
+         items_to_process = items_to_process[batch_size:]
+
+         records = []
+
+         def fetch_item_blocks(item_info: dict) -> List[dict]:
+             """Fetch all blocks for a database or page."""
+             return self.fetch_blocks_recursively(
+                 block_id=item_info["block_id"],
+                 parent_input_database_id=item_info["input_db_id"],
+                 parent_input_page_id=item_info["input_page_id"],
+                 source_page_id=item_info["source_page_id"],
+             )
+
+         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+             futures = {executor.submit(fetch_item_blocks, item_info): item_info for item_info in batch}
+             for future in as_completed(futures):
+                 item_info = futures[future]
+                 try:
+                     blocks = future.result()
+                     for block in blocks:
+                         records.append(SourceRecord(id=block["id"], data=block))
+                     logger.info(f"Fetched {len(blocks)} blocks from {item_info['block_id']}")
+                 except Exception as e:
+                     logger.error(f"Failed to fetch blocks from {item_info['block_id']}: {e}")
+
+         next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== HELPERS ====================
+
+     def get_filter_for_database(self, database_id: str) -> Optional[dict]:
+         """Get the filter configured for a database, if any."""
+         return self.config.database_filters.get(database_id)
+
+     def _extract_title(self, database_data: dict) -> str:
+         """Extract plain text title from database object."""
+         title_parts = database_data.get("title", [])
+         return "".join(part.get("plain_text", "") for part in title_parts)
+
+     # ==================== MARKDOWN CONVERSION ====================
+
+     def _extract_rich_text(self, rich_text_array: List[dict]) -> str:
+         """Convert Notion rich text array to markdown string with formatting."""
+         if not rich_text_array:
+             return ""
+
+         result = []
+         for item in rich_text_array:
+             text = item.get("plain_text", "")
+             annotations = item.get("annotations", {})
+             href = item.get("href")
+
+             # Apply formatting in order: code, bold, italic, strikethrough
+             if annotations.get("code"):
+                 text = f"`{text}`"
+             if annotations.get("bold"):
+                 text = f"**{text}**"
+             if annotations.get("italic"):
+                 text = f"*{text}*"
+             if annotations.get("strikethrough"):
+                 text = f"~~{text}~~"
+             if href:
+                 text = f"[{text}]({href})"
+
+             result.append(text)
+
+         return "".join(result)
+
+     def _block_to_markdown(self, block: dict) -> str:
+         """Convert a single Notion block to markdown string."""
+         block_type = block.get("type", "")
+         content = block.get(block_type) or {}
+
+         # Text blocks
+         if block_type == "paragraph":
+             return self._extract_rich_text(content.get("rich_text", []))
+
+         elif block_type == "heading_1":
+             return f"# {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "heading_2":
+             return f"## {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "heading_3":
+             return f"### {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "bulleted_list_item":
+             return f"- {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "numbered_list_item":
+             return f"1. {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "to_do":
+             checkbox = "[x]" if content.get("checked") else "[ ]"
+             return f"- {checkbox} {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "quote":
+             return f"> {self._extract_rich_text(content.get('rich_text', []))}"
+
+         elif block_type == "callout":
+             icon = content.get("icon") or {}
+             emoji = icon.get("emoji", "💡")
+             text = self._extract_rich_text(content.get("rich_text", []))
+             return f"> {emoji} {text}"
+
+         elif block_type == "code":
+             language = content.get("language", "")
+             code_text = self._extract_rich_text(content.get("rich_text", []))
+             return f"```{language}\n{code_text}\n```"
+
+         elif block_type == "equation":
+             return f"$$ {content.get('expression', '')} $$"
+
+         elif block_type == "divider":
+             return "---"
+
+         elif block_type == "toggle":
+             return f"<details><summary>{self._extract_rich_text(content.get('rich_text', []))}</summary></details>"
+
+         # Media blocks
+         elif block_type == "image":
+             url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
+             caption = self._extract_rich_text(content.get("caption", []))
+             return f"![{caption}]({url})"
+
+         elif block_type == "video":
+             url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
+             return f"[Video]({url})"
+
+         elif block_type == "file":
+             url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
+             caption = self._extract_rich_text(content.get("caption", [])) or "File"
+             return f"[{caption}]({url})"
+
+         elif block_type == "pdf":
+             url = content.get("external", {}).get("url") or content.get("file", {}).get("url", "")
+             return f"[PDF]({url})"
+
+         elif block_type == "bookmark":
+             url = content.get("url", "")
+             caption = self._extract_rich_text(content.get("caption", [])) or url
+             return f"[{caption}]({url})"
+
+         elif block_type == "embed":
+             return f"[Embed]({content.get('url', '')})"
+
+         elif block_type == "link_preview":
+             return f"[Link Preview]({content.get('url', '')})"
+
+         # Table blocks
+         elif block_type == "table":
+             return "[Table - see child blocks for rows]"
+
+         elif block_type == "table_row":
+             cells = content.get("cells", [])
+             row = " | ".join(self._extract_rich_text(cell) for cell in cells)
+             return f"| {row} |"
+
+         # Database/page references
+         elif block_type == "child_page":
+             return f"[Page: {content.get('title', 'Untitled')}]"
+
+         elif block_type == "child_database":
+             return f"[Database: {content.get('title', 'Untitled')}]"
+
+         elif block_type == "link_to_page":
+             page_id = content.get("page_id") or content.get("database_id", "")
+             return f"[Link to page: {page_id}]"
+
+         elif block_type == "table_of_contents":
+             return "[Table of Contents]"
+
+         elif block_type == "breadcrumb":
+             return "[Breadcrumb]"
+
+         elif block_type == "synced_block":
+             return "[Synced Block]"
+
+         elif block_type == "template":
+             return "[Template]"
+
+         elif block_type == "column_list":
+             return ""  # Column list is just a container
+
+         elif block_type == "column":
+             return ""  # Column is just a container
+
+         else:
+             return f"[Unsupported block type: {block_type}]"
+
+     def get_blocks_markdown(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch blocks and convert them to markdown.
+         Returns one record per block with its markdown content.
+         """
+         if pagination:
+             items_to_process = pagination.get("items_to_process", [])
+             items_loaded = pagination.get("items_loaded", False)
+         else:
+             items_to_process = []
+             items_loaded = False
+
+         # Collect all database IDs and page IDs to fetch blocks from
+         if not items_loaded:
+             for page_id in self.config.page_ids:
+                 items_to_process.append(
+                     {
+                         "block_id": page_id,
+                         "input_db_id": None,
+                         "input_page_id": page_id,
+                         "source_page_id": page_id,
+                     }
+                 )
+
+             for db_id in self.config.database_ids:
+                 # Collect pages from database's data_sources
+                 try:
+                     db_filter = self.get_filter_for_database(db_id)
+                     db_data = self.get_database(db_id)
+                     for ds in db_data.get("data_sources", []):
+                         cursor = None
+                         while True:
+                             result = self.query_data_source(ds["id"], cursor, filter=db_filter)
+                             for page in result.get("results", []):
+                                 items_to_process.append(
+                                     {
+                                         "block_id": page["id"],
+                                         "input_db_id": db_id,
+                                         "input_page_id": None,
+                                         "source_page_id": page["id"],
+                                     }
+                                 )
+                             if result.get("has_more"):
+                                 cursor = result.get("next_cursor")
+                             else:
+                                 break
+                 except Exception as e:
+                     logger.error(f"Failed to collect pages from database {db_id}: {e}")
+
+             items_loaded = True
+
+         if not items_to_process:
+             return SourceIteration(records=[], next_pagination={})
+
+         # Process a batch in parallel
+         batch_size = self.config.max_workers
+         batch = items_to_process[:batch_size]
+         items_to_process = items_to_process[batch_size:]
+
+         records = []
+
+         def fetch_and_convert_item(item_info: dict) -> List[dict]:
+             """Fetch blocks for a database or page and convert each to markdown."""
+             blocks = self.fetch_blocks_recursively(
+                 block_id=item_info["block_id"],
+                 parent_input_database_id=item_info["input_db_id"],
+                 parent_input_page_id=item_info["input_page_id"],
+                 source_page_id=item_info["source_page_id"],
+                 fetch_child_databases=False,
+             )
+
+             # Convert each block to markdown record
+             block_records = []
+             for block in blocks or []:
+                 if not block:
+                     continue
+                 md = self._block_to_markdown(block)
+                 block_records.append(
+                     {
+                         "block_id": block.get("id"),
+                         "block_type": block.get("type"),
+                         "markdown": md,
+                         "source_page_id": block.get("source_page_id"),
+                         "parent_block_id": block.get("parent_block_id"),
+                         "parent_input_database_id": block.get("parent_input_database_id"),
+                         "parent_input_page_id": block.get("parent_input_page_id"),
+                         "depth": block.get("depth"),
+                         "block_order": block.get("block_order"),
+                         "page_order": block.get("page_order"),
+                         "block_raw": block,
+                     }
+                 )
+             return block_records
+
+         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+             futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
+             for future in as_completed(futures):
+                 item_info = futures[future]
+                 try:
+                     block_records = future.result()
+                     for block_record in block_records:
+                         records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
+                     logger.info(f"Converted {len(block_records)} blocks to markdown from {item_info['block_id']}")
+                 except Exception as e:
+                     import traceback
+
+                     logger.error(
+                         f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}\n{traceback.format_exc()}"
+                     )
+
+         next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== SEARCH API (ALL_* STREAMS) ====================
+
+     def search(self, start_cursor: str = None) -> dict:
+         """
+         Search all pages and databases accessible to the integration.
+
+         Args:
+             start_cursor: Pagination cursor
+
+         Returns:
+             Raw search results (filter client-side by object type)
+         """
+         payload = {"page_size": self.config.page_size}
+         if start_cursor:
+             payload["start_cursor"] = start_cursor
+
+         response = self.session.post(f"{BASE_URL}/search", json=payload)
+         response.raise_for_status()
+         return response.json()
+
+     def search_by_type(self, object_type: str, start_cursor: str = None) -> dict:
+         """
+         Search and filter by object type client-side.
+
+         Args:
+             object_type: "page" or "database"
+             start_cursor: Pagination cursor
+
+         Returns:
+             Filtered results matching object_type
+         """
+         result = self.search(start_cursor=start_cursor)
+
+         # Filter results by object type
+         filtered_results = [item for item in result.get("results", []) if item.get("object") == object_type]
+
+         return {
+             "results": filtered_results,
+             "has_more": result.get("has_more", False),
+             "next_cursor": result.get("next_cursor"),
+         }
+
+     def get_all_databases(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch all databases accessible to the integration.
+         In 2025-09-03 API, we get data_sources from search and fetch their parent databases.
+         """
+         if pagination:
+             db_ids_to_fetch = pagination.get("db_ids_to_fetch", [])
+             dbs_loaded = pagination.get("dbs_loaded", False)
+         else:
+             db_ids_to_fetch = []
+             dbs_loaded = False
+
+         # Collect unique database IDs from data_sources
+         if not dbs_loaded:
+             seen_db_ids = set()
+             search_cursor = None
+
+             while True:
+                 result = self.search_by_type(object_type="data_source", start_cursor=search_cursor)
+                 for ds in result.get("results", []):
+                     # Data sources have a parent.database_id
+                     parent = ds.get("parent", {})
+                     if parent.get("type") == "database_id":
+                         db_id = parent.get("database_id")
+                         if db_id and db_id not in seen_db_ids:
+                             seen_db_ids.add(db_id)
+                             db_ids_to_fetch.append(db_id)
+
+                 if result.get("has_more"):
+                     search_cursor = result.get("next_cursor")
+                 else:
+                     break
+
+             dbs_loaded = True
+             logger.info(f"Found {len(db_ids_to_fetch)} unique databases from data_sources")
+
+         if not db_ids_to_fetch:
+             return SourceIteration(records=[], next_pagination={})
+
+         # Fetch one database at a time
+         db_id = db_ids_to_fetch[0]
+         db_ids_to_fetch = db_ids_to_fetch[1:]
+
+         records = []
+         try:
+             db_data = self.get_database(db_id)
+             records.append(SourceRecord(id=db_data["id"], data=db_data))
+         except Exception as e:
+             logger.error(f"Failed to fetch database {db_id}: {e}")
+
+         next_pagination = {"db_ids_to_fetch": db_ids_to_fetch, "dbs_loaded": True} if db_ids_to_fetch else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     def get_all_data_sources(self, pagination: dict = None) -> SourceIteration:
+         """Fetch all data_sources accessible to the integration."""
+         cursor = pagination.get("start_cursor") if pagination else None
+
+         result = self.search_by_type(object_type="data_source", start_cursor=cursor)
+
+         records = [SourceRecord(id=ds["id"], data=ds) for ds in result.get("results", [])]
+
+         next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     def get_all_pages(self, pagination: dict = None) -> SourceIteration:
+         """Fetch all pages accessible to the integration."""
+         cursor = pagination.get("start_cursor") if pagination else None
+
+         result = self.search_by_type(object_type="page", start_cursor=cursor)
+
+         records = [SourceRecord(id=page["id"], data=page) for page in result.get("results", [])]
+
+         next_pagination = {"start_cursor": result.get("next_cursor")} if result.get("has_more") else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     def get_all_blocks_markdown(self, pagination: dict = None) -> SourceIteration:
+         """
+         Fetch all databases and pages accessible to the integration and convert their blocks to markdown.
+         Includes databases and pages from search API AND pages from all databases via data_sources.
+         """
+         if pagination:
+             items_to_process = pagination.get("items_to_process", [])
+             items_loaded = pagination.get("items_loaded", False)
+         else:
+             items_to_process = []
+             items_loaded = False
+
+         # Collect databases and pages from search API
+         if not items_loaded:
+             seen_ids = set()
+
+             # 1. Get pages from search API
+             search_cursor = None
+             while True:
+                 result = self.search_by_type(object_type="page", start_cursor=search_cursor)
+                 for page in result.get("results", []):
+                     if page["id"] not in seen_ids:
+                         seen_ids.add(page["id"])
+                         items_to_process.append(
+                             {
+                                 "block_id": page["id"],
+                                 "input_db_id": None,
+                                 "input_page_id": None,
+                                 "source_page_id": page["id"],
+                             }
+                         )
+
+                 if result.get("has_more"):
+                     search_cursor = result.get("next_cursor")
+                 else:
+                     break
+
+             logger.info(f"Found {len(items_to_process)} pages from search API")
+
+             # 2. Get all data_sources and their parent databases + query for pages
+             ds_search_cursor = None
+             while True:
+                 result = self.search_by_type(object_type="data_source", start_cursor=ds_search_cursor)
+                 for ds in result.get("results", []):
+                     ds_id = ds["id"]
+                     # Get parent database_id from data_source
+                     parent = ds.get("parent", {})
+                     parent_db_id = parent.get("database_id") if parent.get("type") == "database_id" else None
+
+                     # Add the parent database to fetch its blocks (headers, descriptions, etc.)
+                     if parent_db_id and parent_db_id not in seen_ids:
+                         seen_ids.add(parent_db_id)
+                         items_to_process.append(
+                             {
+                                 "block_id": parent_db_id,
+                                 "input_db_id": parent_db_id,
+                                 "input_page_id": None,
+                                 "source_page_id": None,
+                             }
+                         )
+
+                     try:
+                         # Query data_source for pages (no filter for all_* streams)
+                         ds_cursor = None
+                         while True:
+                             ds_result = self.query_data_source(ds_id, ds_cursor)
+                             for page in ds_result.get("results", []):
+                                 if page["id"] not in seen_ids:
+                                     seen_ids.add(page["id"])
+                                     items_to_process.append(
+                                         {
+                                             "block_id": page["id"],
+                                             "input_db_id": parent_db_id,
+                                             "input_page_id": None,
+                                             "source_page_id": page["id"],
+                                         }
+                                     )
+                             if ds_result.get("has_more"):
+                                 ds_cursor = ds_result.get("next_cursor")
+                             else:
+                                 break
+                     except Exception as e:
+                         logger.error(f"Failed to get pages from data_source {ds_id}: {e}")
+
+                 if result.get("has_more"):
+                     ds_search_cursor = result.get("next_cursor")
+                 else:
+                     break
+
+             items_loaded = True
+             logger.info(
+                 f"Total {len(items_to_process)} unique items (databases + pages) to process for all_blocks_markdown"
+             )
+
+         if not items_to_process:
+             return SourceIteration(records=[], next_pagination={})
+
+         # Process a batch in parallel
+         batch_size = self.config.max_workers
+         batch = items_to_process[:batch_size]
+         items_to_process = items_to_process[batch_size:]
+
+         records = []
+
+         def fetch_and_convert_item(item_info: dict) -> List[dict]:
+             """Fetch blocks for a database or page and convert each to markdown."""
+             # fetch_child_databases=False because all_blocks_markdown already collects
+             # all pages from all data_sources, so we don't need to recurse into child_database blocks
+             blocks = self.fetch_blocks_recursively(
+                 block_id=item_info["block_id"],
+                 parent_input_database_id=item_info["input_db_id"],
+                 parent_input_page_id=item_info["input_page_id"],
+                 source_page_id=item_info["source_page_id"],
+                 fetch_child_databases=False,
+             )
+
+             # Convert each block to markdown record
+             block_records = []
+             for block in blocks or []:
+                 if not block:
+                     continue
+                 md = self._block_to_markdown(block)
+                 block_records.append(
+                     {
+                         "block_id": block.get("id"),
+                         "block_type": block.get("type"),
+                         "markdown": md,
+                         "source_page_id": block.get("source_page_id"),
+                         "parent_block_id": block.get("parent_block_id"),
+                         "parent_input_database_id": block.get("parent_input_database_id"),
+                         "parent_input_page_id": block.get("parent_input_page_id"),
+                         "depth": block.get("depth"),
+                         "block_order": block.get("block_order"),
+                         "page_order": block.get("page_order"),
+                         "block_raw": block,
+                     }
+                 )
+             return block_records
+
+         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+             futures = {executor.submit(fetch_and_convert_item, item_info): item_info for item_info in batch}
+             for future in as_completed(futures):
+                 item_info = futures[future]
+                 try:
+                     block_records = future.result()
+                     for block_record in block_records:
+                         records.append(SourceRecord(id=block_record.get("block_id"), data=block_record))
+                     logger.info(f"Converted {len(block_records)} blocks to markdown from {item_info['block_id']}")
+                 except Exception as e:
+                     logger.error(f"Failed to fetch/convert blocks from {item_info['block_id']}: {e}")
+
+         next_pagination = {"items_to_process": items_to_process, "items_loaded": True} if items_to_process else {}
+
+         return SourceIteration(records=records, next_pagination=next_pagination)
+
+     # ==================== MAIN DISPATCH ====================
+
+     def get(self, pagination: dict = None) -> SourceIteration:
+         if self.config.stream == NotionStreams.USERS:
+             return self.get_users(pagination)
+         elif self.config.stream == NotionStreams.DATABASES:
+             return self.get_databases(pagination)
+         elif self.config.stream == NotionStreams.DATA_SOURCES:
+             return self.get_data_sources(pagination)
+         elif self.config.stream == NotionStreams.PAGES:
+             return self.get_pages(pagination)
+         elif self.config.stream == NotionStreams.BLOCKS:
+             return self.get_blocks(pagination)
+         elif self.config.stream == NotionStreams.BLOCKS_MARKDOWN:
+             return self.get_blocks_markdown(pagination)
+         elif self.config.stream == NotionStreams.ALL_PAGES:
+             return self.get_all_pages(pagination)
+         elif self.config.stream == NotionStreams.ALL_DATABASES:
+             return self.get_all_databases(pagination)
+         elif self.config.stream == NotionStreams.ALL_DATA_SOURCES:
+             return self.get_all_data_sources(pagination)
+         elif self.config.stream == NotionStreams.ALL_BLOCKS_MARKDOWN:
+             return self.get_all_blocks_markdown(pagination)
+
+         raise NotImplementedError(f"Stream {self.config.stream} not implemented for Notion")
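
Every stream method in this new source implements the same engine-facing contract: get(pagination) receives the next_pagination dict returned by the previous SourceIteration (None on the first call) and returns a new SourceIteration whose empty next_pagination signals that the stream is exhausted. The driver loop below is a hypothetical sketch for illustration only (in bizon the loop is driven by the engine's producer, not by user code like this); it assumes nothing beyond the get() signature and the SourceIteration fields visible in the diff above:

    from typing import List

    from bizon.source.models import SourceRecord


    def drain_stream(source) -> List[SourceRecord]:
        """Read one Notion stream to completion by following next_pagination."""
        records: List[SourceRecord] = []
        pagination = None  # first call carries no cursor state
        while True:
            iteration = source.get(pagination)  # SourceIteration(records=..., next_pagination=...)
            records.extend(iteration.records)
            if not iteration.next_pagination:  # empty dict means the stream is done
                return records
            pagination = iteration.next_pagination  # self-describing, resumable cursor

Because each next_pagination dict carries its whole work queue (remaining IDs, API cursors, loaded flags), a run can be checkpointed after any iteration and resumed later; this appears to be why the heavier streams (blocks, blocks_markdown, all_blocks_markdown) process one batch of max_workers items per call rather than holding state on the instance.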