unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
from typing import Any, AsyncGenerator, Generator, List, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
import notion_client.errors
|
|
5
|
+
from notion_client import Client as NotionClient
|
|
6
|
+
from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
|
|
7
|
+
from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint
|
|
8
|
+
from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
|
|
9
|
+
from notion_client.api_endpoints import Endpoint
|
|
10
|
+
from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint
|
|
11
|
+
|
|
12
|
+
from unstructured_ingest.error import SourceConnectionError, TimeoutError
|
|
13
|
+
from unstructured_ingest.processes.connectors.notion.ingest_backoff import RetryHandler
|
|
14
|
+
from unstructured_ingest.processes.connectors.notion.ingest_backoff.types import RetryStrategyConfig
|
|
15
|
+
from unstructured_ingest.processes.connectors.notion.types.block import Block
|
|
16
|
+
from unstructured_ingest.processes.connectors.notion.types.database import Database
|
|
17
|
+
from unstructured_ingest.processes.connectors.notion.types.database_properties import map_cells
|
|
18
|
+
from unstructured_ingest.processes.connectors.notion.types.page import Page
|
|
19
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@requires_dependencies(["httpx"], extras="notion")
def _get_retry_strategy(
    endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig
) -> RetryHandler:
    """Build the ``RetryHandler`` used to wrap calls made through ``endpoint``.

    The handler uses ``backoff.expo`` as its wait generator and retries on httpx
    timeouts, httpx HTTP status errors and Notion HTTP response errors.

    Args:
        endpoint: Endpoint whose parent client supplies the logger and log level.
        retry_strategy_config: Supplies ``max_retry_time`` (passed as ``max_time``)
            and ``max_retries`` (passed as ``max_tries``).

    Returns:
        A configured ``RetryHandler``.
    """
    import backoff
    import httpx

    retry_on = (
        httpx.TimeoutException,
        httpx.HTTPStatusError,
        notion_client.errors.HTTPResponseError,
    )

    handler_options = {
        "max_time": retry_strategy_config.max_retry_time,
        "max_tries": retry_strategy_config.max_retries,
    }
    # Both the start and the backoff messages are logged at the parent client's level.
    client_logger = endpoint.parent.logger
    handler_options.update(
        logger=client_logger,
        start_log_level=client_logger.level,
        backoff_log_level=client_logger.level,
    )
    return RetryHandler(backoff.expo, retry_on, **handler_options)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Return a retry handler for ``endpoint``, or ``None`` when retries are not configured.

    Args:
        endpoint: Endpoint that may carry a ``retry_strategy_config`` attribute.

    Returns:
        A ``RetryHandler`` built from the endpoint's ``retry_strategy_config`` when that
        attribute is present and truthy, otherwise ``None``.
    """
    # Default to None so an endpoint that never set the attribute is treated as
    # "no retry configured" instead of raising AttributeError.
    retry_strategy_config = getattr(endpoint, "retry_strategy_config", None)
    if not retry_strategy_config:
        return None
    return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Block-children endpoint that parses results into ``Block`` objects.

    Requests go through a retry handler when a ``retry_strategy_config`` was supplied,
    and are issued directly otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        """Forward ``args``/``kwargs`` to the base endpoint and store the retry config."""
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A retry handler built on each access, or ``None`` when no config is set."""
        return get_retry_handler(self)

    def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]:
        """Fetch one page of children for ``block_id``.

        Returns:
            A tuple of the parsed child blocks and the remaining response payload
            (the response dict with its ``results`` entry removed).
        """
        # The property builds a new handler on every access, so read it once per request.
        handler = self.retry_handler
        fetch = super().list
        resp: dict = (
            handler(fetch, block_id=block_id, **kwargs)
            if handler
            else fetch(block_id=block_id, **kwargs)
        )  # type: ignore
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    def iterate_list(
        self,
        block_id: str,
        **kwargs: Any,
    ) -> Generator[List[Block], None, None]:
        """Yield the children of ``block_id`` one response page at a time.

        Iteration stops once the response has a falsy ``has_more`` or no ``next_cursor``.
        """
        fetch = super().list
        next_cursor = None
        while True:
            # One handler per request rather than one for the test and one for the call.
            handler = self.retry_handler
            response: dict = (
                handler(fetch, block_id=block_id, start_cursor=next_cursor, **kwargs)
                if handler
                else fetch(block_id=block_id, start_cursor=next_cursor, **kwargs)
            )  # type: ignore
            child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])]
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint returning parsed ``Database``/``Page`` objects, with optional retries."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        """Forward ``args``/``kwargs`` to the base endpoint and store the retry config."""
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A retry handler built on each access, or ``None`` when no config is set."""
        return get_retry_handler(self)

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch the database ``database_id`` and parse it into a ``Database``."""
        # The property builds a new handler on every access, so read it once per request.
        handler = self.retry_handler
        fetch = super().retrieve
        resp: dict = (
            handler(fetch, database_id=database_id, **kwargs)
            if handler
            else fetch(database_id=database_id, **kwargs)
        )  # type: ignore
        return Database.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a ``HEAD`` request for the database and return the HTTP status code.

        Only the ``auth`` entry of ``kwargs`` is used.

        Raises:
            TimeoutError: If the request times out; the httpx exception is chained as the cause.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        handler = self.retry_handler
        send = self.parent.client.send
        try:
            response: httpx.Response = (
                handler(send, request) if handler else send(request)
            )  # type: ignore
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e
        return response.status_code

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database.

        *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)*

        Returns:
            A tuple of the parsed pages (with ``properties`` mapped through ``map_cells``)
            and the remaining response payload.
        """  # noqa: E501
        handler = self.retry_handler
        fetch = super().query
        resp: dict = (
            handler(fetch, database_id=database_id, **kwargs)
            if handler
            else fetch(database_id=database_id, **kwargs)
        )  # type: ignore
        # Default to an empty list, matching iterate_query, so a payload without
        # "results" yields no pages instead of raising KeyError.
        pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the pages of ``database_id`` one response page at a time.

        Iteration stops once the response has a falsy ``has_more`` or no ``next_cursor``.
        """
        fetch = super().query
        next_cursor = None
        while True:
            handler = self.retry_handler
            response: dict = (
                handler(fetch, database_id=database_id, start_cursor=next_cursor, **kwargs)
                if handler
                else fetch(database_id=database_id, start_cursor=next_cursor, **kwargs)
            )  # type: ignore
            pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
            for p in pages:
                p.properties = map_cells(p.properties)
            yield pages

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class BlocksEndpoint(NotionBlocksEndpoint):
    """Blocks endpoint returning parsed ``Block`` objects, with optional retries."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Initialise the endpoint and its ``children`` sub-endpoint with the same retry config."""
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config
        # Positional unpacking goes before keyword arguments so the call reads in the
        # order the arguments are actually bound.
        self.children = BlocksChildrenEndpoint(
            *args,
            retry_strategy_config=retry_strategy_config,
            **kwargs,
        )

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A retry handler built on each access, or ``None`` when no config is set."""
        return get_retry_handler(self)

    def retrieve(self, block_id: str, **kwargs: Any) -> Block:
        """Fetch the block ``block_id`` and parse it into a ``Block``."""
        # The property builds a new handler on every access, so read it once per request.
        handler = self.retry_handler
        fetch = super().retrieve
        resp: dict = (
            handler(fetch, block_id=block_id, **kwargs)
            if handler
            else fetch(block_id=block_id, **kwargs)
        )  # type: ignore
        return Block.from_dict(data=resp)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class PagesEndpoint(NotionPagesEndpoint):
    """Pages endpoint returning parsed ``Page`` objects, with optional retries."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        """Forward ``args``/``kwargs`` to the base endpoint and store the retry config."""
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A retry handler built on each access, or ``None`` when no config is set."""
        return get_retry_handler(self)

    def retrieve(self, page_id: str, **kwargs: Any) -> Page:
        """Fetch the page ``page_id`` and parse it into a ``Page``."""
        # The property builds a new handler on every access, so read it once per request.
        handler = self.retry_handler
        fetch = super().retrieve
        resp: dict = (
            handler(fetch, page_id=page_id, **kwargs)
            if handler
            else fetch(page_id=page_id, **kwargs)
        )  # type: ignore
        return Page.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, page_id: str, **kwargs) -> int:
        """Send a ``HEAD`` request for the page and return the HTTP status code.

        Only the ``auth`` entry of ``kwargs`` is used.

        Raises:
            TimeoutError: If the request times out; the httpx exception is chained as the cause.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"pages/{page_id}",
            auth=kwargs.get("auth"),
        )
        handler = self.retry_handler
        send = self.parent.client.send
        try:
            response: httpx.Response = (
                handler(send, request) if handler else send(request)
            )  # type: ignore
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e
        return response.status_code
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are the retry-aware ones."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Initialise the base client, then install the retry-aware endpoints.

        Args:
            retry_strategy_config: Retry settings handed to every endpoint; ``None``
                leaves the endpoints without a retry config.
        """
        super().__init__(*args, **kwargs)
        # Every endpoint receives the same retry settings and this client as its parent.
        endpoint_options = {"retry_strategy_config": retry_strategy_config, "parent": self}
        self.blocks = BlocksEndpoint(**endpoint_options)
        self.pages = PagesEndpoint(**endpoint_options)
        self.databases = DatabasesEndpoint(**endpoint_options)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Asynchronous block-children endpoint backed by its own ``httpx.AsyncClient``."""

    def __init__(self, *args, **kwargs):
        """Initialise the base endpoint and create the async HTTP client."""
        super().__init__(*args, **kwargs)
        # NOTE(review): the client is created without default headers, so credentials
        # only reach the request if the caller passes them through **kwargs - confirm.
        self._http_client = httpx.AsyncClient()

    async def list(self, block_id: str, **kwargs: Any) -> tuple[List[Block], dict]:
        """Fetch the list of child blocks asynchronously.

        ``kwargs`` are forwarded to ``httpx.AsyncClient.get``.

        Returns:
            A tuple of the parsed child blocks and the remaining response payload.

        Raises:
            SourceConnectionError: If the response has an error status.
            TimeoutError: If the request times out.
        """
        try:
            response = await self._http_client.get(
                f"{self.parent._api_base}/blocks/{block_id}/children", **kwargs
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Chain the httpx error explicitly so it is reported as the direct cause.
            raise SourceConnectionError(f"Failed to list blocks: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        resp = response.json()
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    async def iterate_list(
        self, block_id: str, **kwargs: Any
    ) -> AsyncGenerator[List[Block], None]:
        """Fetch the list of child blocks in pages asynchronously.

        This is an async generator: consume it with ``async for``. Iteration stops
        once the response has a falsy ``has_more`` or no ``next_cursor``.
        """
        next_cursor = None
        while True:
            # Caller-supplied kwargs are applied last, so they override the cursor entry.
            params = {"start_cursor": next_cursor} if next_cursor else {}
            params.update(kwargs)
            child_blocks, response = await self.list(block_id, **params)
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class AsyncDatabasesEndpoint(NotionDatabasesEndpoint):
    """Asynchronous databases endpoint backed by its own ``httpx.AsyncClient``."""

    def __init__(self, *args, **kwargs):
        """Initialise the base endpoint and create the async HTTP client."""
        super().__init__(*args, **kwargs)
        # NOTE(review): the client is created without default headers, so credentials
        # only reach the request if the caller passes them explicitly - confirm.
        self._http_client = httpx.AsyncClient()

    async def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch a database by its ID asynchronously.

        ``kwargs`` are forwarded to ``httpx.AsyncClient.get``.

        Raises:
            SourceConnectionError: If the response has an error status.
            TimeoutError: If the request times out.
        """
        try:
            response = await self._http_client.get(
                f"{self.parent._api_base}/databases/{database_id}", **kwargs
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Chain the httpx error explicitly so it is reported as the direct cause.
            raise SourceConnectionError(f"Failed to retrieve database: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        return Database.from_dict(data=response.json())

    async def query(self, database_id: str, **kwargs: Any) -> tuple[List[Page], dict]:
        """Query a database asynchronously.

        Only the ``json`` entry of ``kwargs`` is sent (as the request body); any other
        keyword arguments are ignored.

        Returns:
            A tuple of the parsed pages (with ``properties`` mapped through ``map_cells``)
            and the remaining response payload.

        Raises:
            SourceConnectionError: If the response has an error status.
            TimeoutError: If the request times out.
        """
        try:
            response = await self._http_client.post(
                f"{self.parent._api_base}/databases/{database_id}/query",
                json=kwargs.get("json", {}),
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise SourceConnectionError(f"Failed to query database: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        resp = response.json()
        pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
class AsyncClient(NotionClient):
    """Notion client whose ``blocks`` and ``databases`` attributes are the async endpoints.

    ``blocks`` is an ``AsyncBlocksChildrenEndpoint`` (block children) and ``databases``
    is an ``AsyncDatabasesEndpoint``.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialise the base client, then install the async endpoints."""
        super().__init__(*args, **kwargs)
        self.blocks = AsyncBlocksChildrenEndpoint(parent=self)
        self.databases = AsyncDatabasesEndpoint(parent=self)

    async def close(self):
        """Close all async endpoints.

        The databases endpoint is closed even when closing the blocks endpoint raises,
        so a failure in the first close does not leak the second HTTP client.
        """
        try:
            await self.blocks.close()
        finally:
            await self.databases.close()
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from time import time
|
|
3
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
|
|
4
|
+
|
|
5
|
+
from pydantic import UUID4, Field, Secret
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.data_types.file_data import (
|
|
8
|
+
FileData,
|
|
9
|
+
FileDataSourceMetadata,
|
|
10
|
+
SourceIdentifiers,
|
|
11
|
+
)
|
|
12
|
+
from unstructured_ingest.error import SourceConnectionError, ValueError
|
|
13
|
+
from unstructured_ingest.interfaces import (
|
|
14
|
+
AccessConfig,
|
|
15
|
+
ConnectionConfig,
|
|
16
|
+
Downloader,
|
|
17
|
+
DownloaderConfig,
|
|
18
|
+
DownloadResponse,
|
|
19
|
+
Indexer,
|
|
20
|
+
IndexerConfig,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.logger import logger
|
|
23
|
+
from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
|
|
24
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from unstructured_ingest.processes.connectors.notion.client import Client
|
|
28
|
+
|
|
29
|
+
# Passed as `notion_version` when the Notion client is constructed.
NOTION_API_VERSION = "2022-06-28"
|
|
30
|
+
# Connector identifier set on emitted FileData and on the downloader.
CONNECTOR_TYPE = "notion"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Access settings for the Notion connector: holds the API key that
# NotionConnectionConfig.get_client passes to the client as `auth`.
class NotionAccessConfig(AccessConfig):
    notion_api_key: str = Field(description="Notion API key")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class NotionConnectionConfig(ConnectionConfig):
    # Access settings are kept wrapped in Secret and only unwrapped at the
    # moment a client is built.
    access_config: Secret[NotionAccessConfig]

    @requires_dependencies(["notion_client"], extras="notion")
    def get_client(self) -> "Client":
        """Build a Notion client authenticated with the configured API key.

        The client module is imported lazily so that it is only needed
        when a client is actually requested.
        """
        from unstructured_ingest.processes.connectors.notion.client import Client

        api_key = self.access_config.get_secret_value().notion_api_key
        return Client(
            auth=api_key,
            notion_version=NOTION_API_VERSION,
            log_level=logger.level,
            logger=logger,
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class NotionIndexerConfig(IndexerConfig):
    # Starting points for indexing; either list may be omitted.
    page_ids: Optional[list[str]] = Field(
        default=None,
        description="List of Notion page IDs to process",
    )

    database_ids: Optional[list[str]] = Field(
        default=None,
        description="List of Notion database IDs to process",
    )
    # When set, the indexer also follows child pages and child databases.
    recursive: bool = Field(
        default=False,
        description="Recursively process child pages and databases",
    )

    def __post_init__(self):
        """Strip whitespace from the configured IDs and convert them via UUID4.

        NOTE(review): ``__post_init__`` is the dataclass hook; pydantic models
        use ``model_post_init`` instead. If ``IndexerConfig`` is a plain
        pydantic model this method is presumably never invoked -- confirm
        before relying on the normalisation below.
        """
        if self.page_ids:
            self.page_ids = [UUID4(raw_id.strip()) for raw_id in self.page_ids]

        if self.database_ids:
            self.database_ids = [UUID4(raw_id.strip()) for raw_id in self.database_ids]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
class NotionIndexer(Indexer):
    """Emit a FileData record for each configured Notion page and database.

    When ``index_config.recursive`` is set, child pages and child databases
    found under each processed item are indexed as well.
    """

    connection_config: NotionConnectionConfig
    index_config: NotionIndexerConfig

    def is_async(self) -> bool:
        """Report that this indexer runs synchronously."""
        return False

    def precheck(self) -> None:
        """Check the connection to the Notion API.

        Raises:
            SourceConnectionError: building the client or the probe request
                failed, or the response carried an error status.
        """
        try:
            client = self.connection_config.get_client()
            # Perform a simple request to verify connection
            request = client._build_request("HEAD", "users")
            response = client.client.send(request)
            response.raise_for_status()

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the underlying failure stays in the traceback.
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield FileData for every page and database reachable from the config.

        Two pending sets (pages, databases) are drained in alternating
        passes. Each pass iterates a snapshot of its set, so children
        discovered during a pass are picked up on the next iteration of the
        outer loop. Items whose metadata could not be retrieved are skipped
        but are still expanded for children when recursion is enabled.
        """
        client = self.connection_config.get_client()
        processed_pages: set[str] = set()
        processed_databases: set[str] = set()

        pages_to_process: set[str] = set(self.index_config.page_ids or [])
        databases_to_process: set[str] = set(self.index_config.database_ids or [])

        while pages_to_process or databases_to_process:
            # Process pages
            for page_id in list(pages_to_process):
                # Always drop the id from the pending set first: skipping an
                # already-processed id without removing it would keep the
                # outer loop spinning forever.
                pages_to_process.discard(page_id)
                if page_id in processed_pages:
                    continue

                processed_pages.add(page_id)
                file_data = self.get_page_file_data(page_id=page_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    child_pages, child_databases = self.get_child_pages_and_databases(
                        page_id=page_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

            # Process databases
            for database_id in list(databases_to_process):
                databases_to_process.discard(database_id)
                if database_id in processed_databases:
                    continue

                processed_databases.add(database_id)
                file_data = self.get_database_file_data(database_id=database_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    (
                        child_pages,
                        child_databases,
                    ) = self.get_child_pages_and_databases_from_database(
                        database_id=database_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

    @staticmethod
    def _build_file_data(object_id: str, locator_key: str, object_metadata: Any) -> FileData:
        """Assemble the FileData for a retrieved page or database metadata object.

        ``locator_key`` is the record-locator key ("page_id" or
        "database_id") under which ``object_id`` is stored.
        """
        html_name = f"{object_id}.html"
        source_identifiers = SourceIdentifiers(
            filename=html_name,
            fullpath=html_name,
            rel_path=html_name,
        )
        metadata = FileDataSourceMetadata(
            date_created=object_metadata.created_time,
            date_modified=object_metadata.last_edited_time,
            record_locator={locator_key: object_id},
            date_processed=str(time()),
        )
        additional_metadata = {
            "created_by": object_metadata.created_by,
            "last_edited_by": object_metadata.last_edited_by,
            "parent": object_metadata.parent,
            "url": object_metadata.url,
        }
        return FileData(
            identifier=object_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=source_identifiers,
            metadata=metadata,
            additional_metadata=additional_metadata,
            display_name=source_identifiers.fullpath,
        )

    @staticmethod
    def _unprocessed_children(
        child_content: Any,
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the child page and database ids that were not processed yet."""
        child_pages = set(child_content.child_pages) - processed_pages
        child_databases = set(child_content.child_databases) - processed_databases
        return child_pages, child_databases

    @requires_dependencies(["notion_client"], extras="notion")
    def get_page_file_data(self, page_id: str, client: "Client") -> Optional[FileData]:
        """Retrieve a page's metadata and wrap it in a FileData.

        Returns None (after logging the error) when retrieval or
        construction fails.
        """
        try:
            page_metadata = client.pages.retrieve(page_id=page_id)  # type: ignore
            return self._build_file_data(
                object_id=page_id,
                locator_key="page_id",
                object_metadata=page_metadata,
            )
        except Exception as e:
            logger.error(f"Error retrieving page {page_id}: {e}")
            return None

    @requires_dependencies(["notion_client"], extras="notion")
    def get_database_file_data(self, database_id: str, client: "Client") -> Optional[FileData]:
        """Retrieve a database's metadata and wrap it in a FileData.

        Returns None (after logging the error) when retrieval or
        construction fails.
        """
        try:
            database_metadata = client.databases.retrieve(  # type: ignore
                database_id=database_id
            )
            return self._build_file_data(
                object_id=database_id,
                locator_key="database_id",
                object_metadata=database_metadata,
            )
        except Exception as e:
            logger.error(f"Error retrieving database {database_id}: {e}")
            return None

    def get_child_pages_and_databases(
        self,
        page_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed child pages and databases of a page."""
        from unstructured_ingest.processes.connectors.notion.helpers import (
            get_recursive_content_from_page,
        )

        child_content = get_recursive_content_from_page(
            client=client,
            page_id=page_id,
            logger=logger,
        )
        return self._unprocessed_children(child_content, processed_pages, processed_databases)

    def get_child_pages_and_databases_from_database(
        self,
        database_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed child pages and databases of a database."""
        from unstructured_ingest.processes.connectors.notion.helpers import (
            get_recursive_content_from_database,
        )

        child_content = get_recursive_content_from_database(
            client=client,
            database_id=database_id,
            logger=logger,
        )
        return self._unprocessed_children(child_content, processed_pages, processed_databases)

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]:
        """Asynchronous indexing is not implemented; always raises."""
        # Asynchronous run is not implemented
        raise NotImplementedError()
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# Downloader settings for Notion; adds nothing on top of DownloaderConfig.
class NotionDownloaderConfig(DownloaderConfig):
    pass
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
@dataclass
class NotionDownloader(Downloader):
    """Render a Notion page or database to an HTML file on disk."""

    connection_config: NotionConnectionConfig
    download_config: NotionDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the page or database named by ``file_data``'s record locator.

        Dispatches on whether the locator holds a "page_id" or a
        "database_id". Note that the per-kind download methods log and
        return None on failure, and that value is passed through unchanged.

        Raises:
            ValueError: the record locator contains neither key.
        """
        client = self.connection_config.get_client()
        record_locator = file_data.metadata.record_locator

        if "page_id" in record_locator:
            return self.download_page(
                client=client,
                page_id=record_locator["page_id"],
                file_data=file_data,
            )
        if "database_id" in record_locator:
            return self.download_database(
                client=client,
                database_id=record_locator["database_id"],
                file_data=file_data,
            )
        raise ValueError("Invalid record_locator in file_data")

    def _write_html(self, html: Any, file_data: FileData) -> DownloadResponse:
        """Render ``html`` to the download path and build the download response."""
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: the platform default encoding may not be
        # able to represent every character in the rendered content.
        with download_path.open("w", encoding="utf-8") as html_file:
            html_file.write(html.render(pretty=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def download_page(
        self, client, page_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a page as HTML and write it to the download path.

        Returns None, after logging an error, when the extraction yields no
        HTML or when any step raises.
        """
        from unstructured_ingest.processes.connectors.notion.helpers import extract_page_html

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=page_id,
                logger=logger,
            )

            if not text_extraction.html:
                logger.error(f"No HTML content for page {page_id}")
                return None
            return self._write_html(html=text_extraction.html, file_data=file_data)
        except Exception as e:
            logger.error(f"Error downloading page {page_id}: {e}")
            return None

    def download_database(
        self, client, database_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a database as HTML and write it to the download path.

        Returns None, after logging an error, when the extraction yields no
        HTML or when any step raises.
        """
        from unstructured_ingest.processes.connectors.notion.helpers import extract_database_html

        try:
            text_extraction = extract_database_html(
                client=client,
                database_id=database_id,
                logger=logger,
            )
            if not text_extraction.html:
                logger.error(f"No HTML content for database {database_id}")
                return None
            return self._write_html(html=text_extraction.html, file_data=file_data)
        except Exception as e:
            logger.error(f"Error downloading database {database_id}: {e}")
            return None
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# Source registry entry that bundles the Notion connection config, indexer
# and downloader classes together with their config classes.
notion_source_entry = SourceRegistryEntry(
    connection_config=NotionConnectionConfig,
    indexer_config=NotionIndexerConfig,
    indexer=NotionIndexer,
    downloader_config=NotionDownloaderConfig,
    downloader=NotionDownloader,
)
|