unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,349 @@
1
from typing import Any, AsyncGenerator, Generator, List, Optional, Tuple

import httpx
import notion_client.errors
from notion_client import Client as NotionClient
from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint
from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
from notion_client.api_endpoints import Endpoint
from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint

from unstructured_ingest.error import SourceConnectionError, TimeoutError
from unstructured_ingest.processes.connectors.notion.ingest_backoff import RetryHandler
from unstructured_ingest.processes.connectors.notion.ingest_backoff.types import RetryStrategyConfig
from unstructured_ingest.processes.connectors.notion.types.block import Block
from unstructured_ingest.processes.connectors.notion.types.database import Database
from unstructured_ingest.processes.connectors.notion.types.database_properties import map_cells
from unstructured_ingest.processes.connectors.notion.types.page import Page
from unstructured_ingest.utils.dep_check import requires_dependencies
20
+
21
+
22
@requires_dependencies(["httpx"], extras="notion")
def _get_retry_strategy(
    endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig
) -> RetryHandler:
    """Build a ``RetryHandler`` for ``endpoint`` from ``retry_strategy_config``.

    The handler uses ``backoff.expo`` as its wait generator and retries on
    httpx timeouts, httpx HTTP status errors and notion_client HTTP response
    errors. Retry limits come from the config; the logger and both log levels
    come from the endpoint's parent client.
    """
    import backoff
    import httpx

    # Exceptions that trigger another attempt.
    retry_on = (
        httpx.TimeoutException,
        httpx.HTTPStatusError,
        notion_client.errors.HTTPResponseError,
    )

    parent_logger = endpoint.parent.logger
    level = parent_logger.level

    return RetryHandler(
        backoff.expo,
        retry_on,
        max_time=retry_strategy_config.max_retry_time,
        max_tries=retry_strategy_config.max_retries,
        logger=parent_logger,
        start_log_level=level,
        backoff_log_level=level,
    )
44
+
45
+
46
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Return a retry handler for ``endpoint``, or ``None`` when retries are off.

    Args:
        endpoint: Endpoint whose ``retry_strategy_config`` attribute (if any)
            describes the retry policy.

    Returns:
        A new ``RetryHandler`` when the endpoint carries a truthy
        ``retry_strategy_config``; ``None`` when the attribute is missing,
        ``None`` or otherwise falsy.
    """
    # A default is supplied so that an endpoint without the attribute simply
    # means "no retries" instead of raising AttributeError.
    retry_strategy_config = getattr(endpoint, "retry_strategy_config", None)
    if not retry_strategy_config:
        return None
    return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
50
+
51
+
52
class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Block-children endpoint that parses results into ``Block`` objects.

    Requests go through a ``RetryHandler`` when a ``retry_strategy_config``
    was supplied and are sent directly otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A newly built retry handler, or ``None`` when retries are disabled."""
        return get_retry_handler(self)

    def _call_with_retry(self, func, *args: Any, **kwargs: Any) -> Any:
        """Invoke ``func`` through the retry handler when one is configured."""
        # ``retry_handler`` builds a new handler on each access, so read it
        # once instead of once for the check and once more for the call.
        handler = self.retry_handler
        if handler:
            return handler(func, *args, **kwargs)
        return func(*args, **kwargs)

    def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]:
        """Fetch one page of children of ``block_id``.

        Returns:
            The parsed child blocks and the remaining response payload with
            its ``results`` entry removed.
        """
        resp: dict = self._call_with_retry(
            super().list, block_id=block_id, **kwargs
        )  # type: ignore
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    def iterate_list(
        self,
        block_id: str,
        **kwargs: Any,
    ) -> Generator[List[Block], None, None]:
        """Yield the children of ``block_id`` one response page at a time.

        Iteration stops when the response reports no more pages or carries no
        ``next_cursor``.
        """
        next_cursor = None
        while True:
            response: dict = self._call_with_retry(
                super().list, block_id=block_id, start_cursor=next_cursor, **kwargs
            )  # type: ignore
            child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])]
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
95
+
96
+
97
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint returning parsed ``Database`` / ``Page`` objects.

    Requests go through a ``RetryHandler`` when a ``retry_strategy_config``
    was supplied and are sent directly otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A newly built retry handler, or ``None`` when retries are disabled."""
        return get_retry_handler(self)

    def _call_with_retry(self, func, *args: Any, **kwargs: Any) -> Any:
        """Invoke ``func`` through the retry handler when one is configured."""
        # ``retry_handler`` builds a new handler on each access, so read it
        # once instead of once for the check and once more for the call.
        handler = self.retry_handler
        if handler:
            return handler(func, *args, **kwargs)
        return func(*args, **kwargs)

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch the database ``database_id`` and parse it into a ``Database``."""
        resp: dict = self._call_with_retry(
            super().retrieve, database_id=database_id, **kwargs
        )  # type: ignore
        return Database.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a HEAD request for the database and return the HTTP status code.

        Only ``auth`` is read from ``kwargs``.

        Raises:
            TimeoutError: the project's timeout error (imported from
                ``unstructured_ingest.error``), when httpx times out.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        try:
            response: httpx.Response = self._call_with_retry(
                self.parent.client.send, request
            )  # type: ignore
            return response.status_code
        except httpx.TimeoutException as e:
            # Chain explicitly so the httpx error is kept as the cause.
            raise TimeoutError(str(e)) from e

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database.

        *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)*

        Returns the parsed pages (with their properties passed through
        ``map_cells``) and the remaining response payload.
        """  # noqa: E501
        resp: dict = self._call_with_retry(
            super().query, database_id=database_id, **kwargs
        )  # type: ignore
        # Default to an empty list, matching ``iterate_query`` and the other
        # endpoints, so a payload without ``results`` yields no pages.
        pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the pages of ``database_id`` one response page at a time.

        Iteration stops when the response reports no more pages or carries no
        ``next_cursor``.
        """
        next_cursor = None
        while True:
            response: dict = self._call_with_retry(
                super().query, database_id=database_id, start_cursor=next_cursor, **kwargs
            )  # type: ignore
            pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
            for p in pages:
                p.properties = map_cells(p.properties)
            yield pages

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
171
+
172
+
173
class BlocksEndpoint(NotionBlocksEndpoint):
    """Blocks endpoint returning parsed ``Block`` objects.

    Its ``children`` attribute is a ``BlocksChildrenEndpoint`` built with the
    same arguments and retry configuration.
    """

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config
        # Positional unpacking is written before the keyword argument; the
        # call is equivalent but no longer reads as if the order mattered.
        self.children = BlocksChildrenEndpoint(
            *args,
            retry_strategy_config=retry_strategy_config,
            **kwargs,
        )

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A newly built retry handler, or ``None`` when retries are disabled."""
        return get_retry_handler(self)

    def retrieve(self, block_id: str, **kwargs: Any) -> Block:
        """Fetch the block ``block_id`` and parse it into a ``Block``."""
        # ``retry_handler`` builds a new handler on each access, so read it
        # once instead of once for the check and once more for the call.
        handler = self.retry_handler
        fetch = super().retrieve
        resp: dict = (
            handler(fetch, block_id=block_id, **kwargs)
            if handler
            else fetch(block_id=block_id, **kwargs)
        )  # type: ignore
        return Block.from_dict(data=resp)
199
+
200
+
201
class PagesEndpoint(NotionPagesEndpoint):
    """Pages endpoint returning parsed ``Page`` objects.

    Requests go through a ``RetryHandler`` when a ``retry_strategy_config``
    was supplied and are sent directly otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A newly built retry handler, or ``None`` when retries are disabled."""
        return get_retry_handler(self)

    def _call_with_retry(self, func, *args: Any, **kwargs: Any) -> Any:
        """Invoke ``func`` through the retry handler when one is configured."""
        # ``retry_handler`` builds a new handler on each access, so read it
        # once instead of once for the check and once more for the call.
        handler = self.retry_handler
        if handler:
            return handler(func, *args, **kwargs)
        return func(*args, **kwargs)

    def retrieve(self, page_id: str, **kwargs: Any) -> Page:
        """Fetch the page ``page_id`` and parse it into a ``Page``."""
        resp: dict = self._call_with_retry(
            super().retrieve, page_id=page_id, **kwargs
        )  # type: ignore
        return Page.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, page_id: str, **kwargs) -> int:
        """Send a HEAD request for the page and return the HTTP status code.

        Only ``auth`` is read from ``kwargs``.

        Raises:
            TimeoutError: the project's timeout error (imported from
                ``unstructured_ingest.error``), when httpx times out.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"pages/{page_id}",
            auth=kwargs.get("auth"),
        )
        try:
            response: httpx.Response = self._call_with_retry(
                self.parent.client.send, request
            )  # type: ignore
            return response.status_code
        except httpx.TimeoutException as e:
            # Chain explicitly so the httpx error is kept as the cause.
            raise TimeoutError(str(e)) from e
241
+
242
+
243
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are replaced.

    The replacements are the endpoint classes defined in this module, each
    created with this client as ``parent`` and the given retry configuration.
    """

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        # Every endpoint receives the same two keyword arguments.
        endpoint_kwargs = {
            "retry_strategy_config": retry_strategy_config,
            "parent": self,
        }
        self.blocks = BlocksEndpoint(**endpoint_kwargs)
        self.pages = PagesEndpoint(**endpoint_kwargs)
        self.databases = DatabasesEndpoint(**endpoint_kwargs)
254
+
255
+
256
class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Async variant of the block-children endpoint backed by its own httpx client."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): this client is created without default headers, so any
        # auth / version headers presumably have to arrive via per-call
        # kwargs - confirm against callers.
        self._http_client = httpx.AsyncClient()

    async def list(self, block_id: str, **kwargs: Any) -> tuple[List[Block], dict]:
        """Fetch the list of child blocks asynchronously.

        ``kwargs`` are forwarded unchanged to ``httpx.AsyncClient.get``.

        Returns:
            The parsed child blocks and the remaining response payload with
            its ``results`` entry removed.

        Raises:
            SourceConnectionError: the response has an error status.
            TimeoutError: the project's timeout error, when httpx times out.
        """
        try:
            # NOTE(review): ``_api_base`` is not defined anywhere in this
            # file; verify the parent client actually exposes it.
            response = await self._http_client.get(
                f"{self.parent._api_base}/blocks/{block_id}/children", **kwargs
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise SourceConnectionError(f"Failed to list blocks: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        resp = response.json()
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    async def iterate_list(
        self, block_id: str, **kwargs: Any
    ) -> AsyncGenerator[List[Block], None]:
        """Fetch the list of child blocks in pages asynchronously.

        Yields one list of blocks per response page and stops when the
        response reports no more pages or carries no ``next_cursor``.
        """
        next_cursor = None
        while True:
            request_kwargs = dict(kwargs)
            if next_cursor:
                # ``list`` forwards its kwargs to ``httpx.AsyncClient.get``,
                # which has no ``start_cursor`` keyword; the cursor therefore
                # travels as a query parameter, merged with any caller-supplied
                # ``params``.
                query = httpx.QueryParams(request_kwargs.get("params") or {})
                request_kwargs["params"] = query.set("start_cursor", next_cursor)
            child_blocks, response = await self.list(block_id, **request_kwargs)
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
295
+
296
+
297
class AsyncDatabasesEndpoint(NotionDatabasesEndpoint):
    """Async variant of the databases endpoint backed by its own httpx client."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): this client is created without default headers, so any
        # auth / version headers presumably have to arrive via per-call
        # kwargs - confirm against callers.
        self._http_client = httpx.AsyncClient()

    async def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch a database by its ID asynchronously.

        ``kwargs`` are forwarded unchanged to ``httpx.AsyncClient.get``.

        Raises:
            SourceConnectionError: the response has an error status.
            TimeoutError: the project's timeout error, when httpx times out.
        """
        try:
            # NOTE(review): ``_api_base`` is not defined anywhere in this
            # file; verify the parent client actually exposes it.
            response = await self._http_client.get(
                f"{self.parent._api_base}/databases/{database_id}", **kwargs
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise SourceConnectionError(f"Failed to retrieve database: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        return Database.from_dict(data=response.json())

    async def query(self, database_id: str, **kwargs: Any) -> tuple[List[Page], dict]:
        """Query a database asynchronously.

        Only the ``json`` entry of ``kwargs`` is used (as the POST body,
        defaulting to an empty dict); other entries are ignored.

        Returns:
            The parsed pages (with their properties passed through
            ``map_cells``) and the remaining response payload.

        Raises:
            SourceConnectionError: the response has an error status.
            TimeoutError: the project's timeout error, when httpx times out.
        """
        try:
            response = await self._http_client.post(
                f"{self.parent._api_base}/databases/{database_id}/query",
                json=kwargs.get("json", {}),
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise SourceConnectionError(f"Failed to query database: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise TimeoutError(str(e)) from e

        resp = response.json()
        pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
338
+
339
+
340
class AsyncClient(NotionClient):
    """Notion client whose ``blocks`` and ``databases`` endpoints are async.

    ``blocks`` is an ``AsyncBlocksChildrenEndpoint`` and ``databases`` an
    ``AsyncDatabasesEndpoint``; each owns an httpx client that ``close``
    shuts down.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.blocks = AsyncBlocksChildrenEndpoint(parent=self)
        self.databases = AsyncDatabasesEndpoint(parent=self)

    async def close(self):
        """Close all async endpoints."""
        try:
            await self.blocks.close()
        finally:
            # Runs even if closing ``blocks`` raised, so the second HTTP
            # client is not leaked.
            await self.databases.close()
@@ -0,0 +1,350 @@
1
+ from dataclasses import dataclass
2
+ from time import time
3
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
4
+
5
+ from pydantic import UUID4, Field, Secret
6
+
7
+ from unstructured_ingest.data_types.file_data import (
8
+ FileData,
9
+ FileDataSourceMetadata,
10
+ SourceIdentifiers,
11
+ )
12
+ from unstructured_ingest.error import SourceConnectionError, ValueError
13
+ from unstructured_ingest.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Downloader,
17
+ DownloaderConfig,
18
+ DownloadResponse,
19
+ Indexer,
20
+ IndexerConfig,
21
+ )
22
+ from unstructured_ingest.logger import logger
23
+ from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
24
+ from unstructured_ingest.utils.dep_check import requires_dependencies
25
+
26
+ if TYPE_CHECKING:
27
+ from unstructured_ingest.processes.connectors.notion.client import Client
28
+
29
# Passed as ``notion_version`` when the Notion client is constructed.
NOTION_API_VERSION = "2022-06-28"
# Connector identifier stamped on emitted FileData and on the downloader.
CONNECTOR_TYPE = "notion"
31
+
32
+
33
class NotionAccessConfig(AccessConfig):
    """Credentials for the Notion connector."""

    # Used as the ``auth`` value when the Notion client is built.
    notion_api_key: str = Field(description="Notion API key")
35
+
36
+
37
class NotionConnectionConfig(ConnectionConfig):
    """Connection settings for Notion, holding the access config as a secret."""

    access_config: Secret[NotionAccessConfig]

    @requires_dependencies(["notion_client"], extras="notion")
    def get_client(self) -> "Client":
        """Create a Notion ``Client`` authenticated with the configured API key.

        The client is pinned to ``NOTION_API_VERSION`` and shares this
        package's logger and its current level.
        """
        # Deferred import: the client module needs the optional "notion" extra.
        from unstructured_ingest.processes.connectors.notion.client import Client

        api_key = self.access_config.get_secret_value().notion_api_key
        client_options = {
            "notion_version": NOTION_API_VERSION,
            "auth": api_key,
            "logger": logger,
            "log_level": logger.level,
        }
        return Client(**client_options)
50
+
51
+
52
class NotionIndexerConfig(IndexerConfig):
    """Options selecting which Notion pages and databases the indexer visits."""

    page_ids: Optional[list[str]] = Field(
        default=None, description="List of Notion page IDs to process"
    )

    database_ids: Optional[list[str]] = Field(
        default=None, description="List of Notion database IDs to process"
    )
    recursive: bool = Field(
        default=False, description="Recursively process child pages and databases"
    )

    # NOTE(review): ``__post_init__`` is the dataclass post-construction hook.
    # The fields above are declared with pydantic ``Field``, so IndexerConfig is
    # presumably a pydantic model, in which case this method is never invoked
    # automatically — confirm, and if normalization is wanted move it into a
    # pydantic validator. Be aware that enabling it would turn the IDs into
    # UUID objects rather than the ``str`` values the fields declare.
    def __post_init__(self):
        """Strip whitespace from each configured ID and pass it through ``UUID4``."""
        if self.page_ids:
            self.page_ids: list[UUID4] = [UUID4(p.strip()) for p in self.page_ids]

        if self.database_ids:
            self.database_ids: list[UUID4] = [UUID4(p.strip()) for p in self.database_ids]
70
+
71
+
72
@dataclass
class NotionIndexer(Indexer):
    """Enumerate Notion pages and databases as ``FileData`` records.

    Starting from the page and database IDs in ``index_config``, each object's
    metadata is fetched and turned into a ``FileData``. When
    ``index_config.recursive`` is set, child pages and databases discovered
    under each visited object are queued and visited as well.
    """

    connection_config: NotionConnectionConfig
    index_config: NotionIndexerConfig

    def is_async(self) -> bool:
        """Return ``False``: only the synchronous :meth:`run` is implemented."""
        return False

    def precheck(self) -> None:
        """Check the connection to the Notion API.

        Raises:
            SourceConnectionError: if building the client or the probe request
                fails for any reason; the original error is chained as the cause.
        """
        try:
            client = self.connection_config.get_client()
            # Perform a simple request to verify connection
            request = client._build_request("HEAD", "users")
            response = client.client.send(request)
            response.raise_for_status()

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every reachable page and database.

        Each pass first drains the currently queued pages, then the currently
        queued databases; children found along the way are queued for a later
        pass. Objects whose metadata cannot be retrieved are logged and skipped
        (no ``FileData`` is yielded), but their children are still explored
        when recursion is enabled.

        Args:
            **kwargs: Accepted for interface compatibility; unused.

        Yields:
            One ``FileData`` per successfully retrieved page or database.
        """
        client = self.connection_config.get_client()
        processed_pages: set[str] = set()
        processed_databases: set[str] = set()

        pages_to_process: set[str] = set(self.index_config.page_ids or [])
        databases_to_process: set[str] = set(self.index_config.database_ids or [])

        while pages_to_process or databases_to_process:
            # Process pages (iterate a snapshot: the set is mutated in the loop)
            for page_id in list(pages_to_process):
                # Always drop the id from the pending set first. Skipping an
                # already-processed id without removing it would leave the set
                # non-empty forever and spin the outer loop.
                pages_to_process.discard(page_id)
                if page_id in processed_pages:
                    continue
                processed_pages.add(page_id)

                file_data = self.get_page_file_data(page_id=page_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    child_pages, child_databases = self.get_child_pages_and_databases(
                        page_id=page_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

            # Process databases
            for database_id in list(databases_to_process):
                databases_to_process.discard(database_id)
                if database_id in processed_databases:
                    continue
                processed_databases.add(database_id)

                file_data = self.get_database_file_data(database_id=database_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    (
                        child_pages,
                        child_databases,
                    ) = self.get_child_pages_and_databases_from_database(
                        database_id=database_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

    @staticmethod
    def _build_file_data(object_id: str, locator_key: str, notion_metadata: Any) -> FileData:
        """Assemble the ``FileData`` shared by pages and databases.

        Args:
            object_id: Notion page or database ID; also used as ``<id>.html``
                for the filename, full path and relative path.
            locator_key: Key stored in ``record_locator`` (``"page_id"`` or
                ``"database_id"``).
            notion_metadata: Object returned by the Notion ``retrieve`` call,
                read for its timestamps, authorship, parent and URL.
        """
        html_name = f"{object_id}.html"
        source_identifiers = SourceIdentifiers(
            filename=html_name,
            fullpath=html_name,
            rel_path=html_name,
        )
        metadata = FileDataSourceMetadata(
            date_created=notion_metadata.created_time,
            date_modified=notion_metadata.last_edited_time,
            record_locator={locator_key: object_id},
            date_processed=str(time()),
        )
        additional_metadata = {
            "created_by": notion_metadata.created_by,
            "last_edited_by": notion_metadata.last_edited_by,
            "parent": notion_metadata.parent,
            "url": notion_metadata.url,
        }
        return FileData(
            identifier=object_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=source_identifiers,
            metadata=metadata,
            additional_metadata=additional_metadata,
            display_name=source_identifiers.fullpath,
        )

    @requires_dependencies(["notion_client"], extras="notion")
    def get_page_file_data(self, page_id: str, client: "Client") -> Optional[FileData]:
        """Fetch a page's metadata and wrap it in a ``FileData``.

        Returns:
            The ``FileData`` for the page, or ``None`` if anything fails (the
            error is logged, not raised).
        """
        try:
            page_metadata = client.pages.retrieve(page_id=page_id)  # type: ignore
            return self._build_file_data(
                object_id=page_id,
                locator_key="page_id",
                notion_metadata=page_metadata,
            )
        except Exception as e:
            logger.error(f"Error retrieving page {page_id}: {e}")
            return None

    @requires_dependencies(["notion_client"], extras="notion")
    def get_database_file_data(self, database_id: str, client: "Client") -> Optional[FileData]:
        """Fetch a database's metadata and wrap it in a ``FileData``.

        Returns:
            The ``FileData`` for the database, or ``None`` if anything fails
            (the error is logged, not raised).
        """
        try:
            database_metadata = client.databases.retrieve(  # type: ignore
                database_id=database_id
            )
            return self._build_file_data(
                object_id=database_id,
                locator_key="database_id",
                notion_metadata=database_metadata,
            )
        except Exception as e:
            logger.error(f"Error retrieving database {database_id}: {e}")
            return None

    def get_child_pages_and_databases(
        self,
        page_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed children found under a page.

        Returns:
            ``(child_pages, child_databases)`` as reported by
            ``get_recursive_content_from_page``, minus the IDs already present
            in ``processed_pages`` / ``processed_databases``.
        """
        from unstructured_ingest.processes.connectors.notion.helpers import (
            get_recursive_content_from_page,
        )

        child_content = get_recursive_content_from_page(
            client=client,
            page_id=page_id,
            logger=logger,
        )
        child_pages = set(child_content.child_pages) - processed_pages
        child_databases = set(child_content.child_databases) - processed_databases
        return child_pages, child_databases

    def get_child_pages_and_databases_from_database(
        self,
        database_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed children found under a database.

        Returns:
            ``(child_pages, child_databases)`` as reported by
            ``get_recursive_content_from_database``, minus the IDs already
            present in ``processed_pages`` / ``processed_databases``.
        """
        from unstructured_ingest.processes.connectors.notion.helpers import (
            get_recursive_content_from_database,
        )

        child_content = get_recursive_content_from_database(
            client=client,
            database_id=database_id,
            logger=logger,
        )
        child_pages = set(child_content.child_pages) - processed_pages
        child_databases = set(child_content.child_databases) - processed_databases
        return child_pages, child_databases

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]:
        """Asynchronous indexing is not implemented; always raises."""
        # Asynchronous run is not implemented
        raise NotImplementedError()
263
+
264
+
265
class NotionDownloaderConfig(DownloaderConfig):
    """Downloader options for Notion; adds nothing beyond ``DownloaderConfig``."""

    pass
267
+
268
+
269
@dataclass
class NotionDownloader(Downloader):
    """Render Notion pages and databases to local HTML files.

    The ``record_locator`` on the incoming ``FileData`` decides whether a page
    or a database is downloaded.
    """

    connection_config: NotionConnectionConfig
    download_config: NotionDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the page or database referenced by ``file_data``.

        Args:
            file_data: Record whose ``metadata.record_locator`` contains either
                ``"page_id"`` or ``"database_id"`` (``"page_id"`` wins if both
                are present).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The download response, or ``None`` when extraction fails or yields
            no HTML (the download helpers log such failures and do not raise).

        Raises:
            ValueError: if the record locator carries neither key.
        """
        client = self.connection_config.get_client()
        record_locator = file_data.metadata.record_locator

        if "page_id" in record_locator:
            return self.download_page(
                client=client,
                page_id=record_locator["page_id"],
                file_data=file_data,
            )
        elif "database_id" in record_locator:
            return self.download_database(
                client=client,
                database_id=record_locator["database_id"],
                file_data=file_data,
            )
        else:
            raise ValueError("Invalid record_locator in file_data")

    def _write_notion_html(self, html: Any, file_data: FileData) -> DownloadResponse:
        """Render ``html`` to the download path and build the download response."""
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: the platform default encoding may be unable
        # to represent non-ASCII characters in the rendered document.
        with download_path.open("w", encoding="utf-8") as html_file:
            html_file.write(html.render(pretty=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def download_page(
        self, client, page_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a page as HTML and write it to disk.

        Returns:
            The download response, or ``None`` if extraction produced no HTML
            or any step raised (the error is logged, not propagated).
        """
        from unstructured_ingest.processes.connectors.notion.helpers import extract_page_html

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=page_id,
                logger=logger,
            )

            if text_extraction.html:
                return self._write_notion_html(html=text_extraction.html, file_data=file_data)
            else:
                logger.error(f"No HTML content for page {page_id}")
                return None
        except Exception as e:
            logger.error(f"Error downloading page {page_id}: {e}")
            return None

    def download_database(
        self, client, database_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a database as HTML and write it to disk.

        Returns:
            The download response, or ``None`` if extraction produced no HTML
            or any step raised (the error is logged, not propagated).
        """
        from unstructured_ingest.processes.connectors.notion.helpers import extract_database_html

        try:
            text_extraction = extract_database_html(
                client=client,
                database_id=database_id,
                logger=logger,
            )
            if text_extraction.html:
                return self._write_notion_html(html=text_extraction.html, file_data=file_data)
            else:
                logger.error(f"No HTML content for database {database_id}")
                return None
        except Exception as e:
            logger.error(f"Error downloading database {database_id}: {e}")
            return None
342
+
343
+
344
# Source registry entry bundling the Notion connector's config, indexer and
# downloader classes.
notion_source_entry = SourceRegistryEntry(
    connection_config=NotionConnectionConfig,
    indexer_config=NotionIndexerConfig,
    indexer=NotionIndexer,
    downloader_config=NotionDownloaderConfig,
    downloader=NotionDownloader,
)