unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,49 @@
1
+ # https://developers.notion.com/reference/block#file
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.file import External
10
+ from unstructured_ingest.processes.connectors.notion.types.file import File as FileContent
11
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
12
+
13
+
14
@dataclass
class File(BlockBase):
    """Notion ``file`` block: a link to a file plus an optional caption.

    See https://developers.notion.com/reference/block#file
    """

    # Discriminator from the payload; from_dict handles "external" and "file".
    type: str
    external: Optional[External] = None
    file: Optional[FileContent] = None
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type does not hold child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a File block from its payload dict.

        The input dict is only read; no keys are removed from it.

        Raises:
            KeyError: if ``type`` is missing, or if the key named by ``type``
                ("external" or "file") is missing.
        """
        # get() instead of pop(): parsing should not strip keys from the caller's dict.
        caption = [RichText.from_dict(rt) for rt in data.get("caption", [])]
        kind = data["type"]
        block = cls(type=kind, caption=caption)
        if kind == "external":
            block.external = External.from_dict(data["external"])
        elif kind == "file":
            block.file = FileContent.from_dict(data["file"])
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render the file link(s) and caption, separated by line breaks.

        Returns:
            A ``Div`` wrapping the rendered pieces, or None when the block has
            no hosted file, no external file and no caption.
        """
        pieces = []
        if self.file:
            pieces.append(A([Href(self.file.url)], self.file.url))
        if self.external:
            pieces.append(A([Href(self.external.url)], self.external.url))
        if self.caption:
            pieces.append(Div([], [rt.get_html() for rt in self.caption]))
        if not pieces:
            return None

        # Interleave a fresh <br> between pieces rather than reusing one shared tag object.
        joined = []
        for position, piece in enumerate(pieces):
            if position:
                joined.append(Br())
            joined.append(piece)
        return Div([], joined)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#headings
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Heading(BlockBase):
    """Notion heading block.

    See https://developers.notion.com/reference/block#headings
    """

    color: str
    is_toggleable: bool
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: headings are treated as leaf blocks here."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Heading from its payload dict without modifying ``data``.

        Every key other than ``rich_text`` is forwarded to the constructor as a
        keyword argument, so unexpected or missing keys surface as ``TypeError``.
        """
        # Shallow copy so that stripping "rich_text" does not alter the caller's dict.
        kwargs = dict(data)
        raw_rich_text = kwargs.pop("rich_text", [])
        heading = cls(**kwargs)
        heading.rich_text = [RichText.from_dict(rt) for rt in raw_rich_text]
        return heading

    def get_html(self) -> Optional[HtmlTag]:
        """Render the heading text in a ``Div``; return None when there is no text.

        A ``color`` other than "default" is emitted as an inline ``color`` style.
        """
        if not self.rich_text:
            return None

        rendered = [rt.get_html() for rt in self.rich_text]
        attributes = []
        if self.color and self.color != "default":
            attributes.append(Style(f"color: {self.color}"))
        return Div(attributes, rendered)
@@ -0,0 +1,21 @@
1
+ # https://developers.notion.com/reference/block#image
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.attributes import Src
5
+ from htmlBuilder.tags import HtmlTag, Img
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+ from unstructured_ingest.processes.connectors.notion.types.file import FileObject
9
+
10
+
11
class Image(BlockBase, FileObject):
    """Notion ``image`` block rendered as an ``<img>`` tag.

    See https://developers.notion.com/reference/block#image
    """

    @staticmethod
    def can_have_children() -> bool:
        """Return False: image blocks do not hold child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``Img`` for the external source, else the hosted file, else None."""
        # External takes precedence; ``or`` only falls through to ``file`` when it is falsy.
        source = self.external or self.file
        if not source:
            return None
        return Img([Src(source.url)], [])
@@ -0,0 +1,24 @@
1
+ # https://developers.notion.com/reference/block#link-preview
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+
10
+
11
@dataclass
class LinkPreview(BlockBase):
    """Notion ``link_preview`` block: a single URL.

    See https://developers.notion.com/reference/block#link-preview
    """

    url: str

    @staticmethod
    def can_have_children() -> bool:
        """Return False: link previews do not hold child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` to the constructor."""
        preview = cls(**data)
        return preview

    def get_html(self) -> Optional[HtmlTag]:
        """Render the URL as an anchor whose text is the URL itself."""
        target = self.url
        return A([Href(target)], target)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#link-to-page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class LinkToPage(BlockBase):
    """Notion ``link_to_page`` block referencing a page or a database by id.

    See https://developers.notion.com/reference/block#link-to-page
    """

    type: str
    page_id: Optional[str] = None
    database_id: Optional[str] = None

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type does not hold child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` to the constructor."""
        link = cls(**data)
        return link

    def get_html(self) -> Optional[HtmlTag]:
        """Render the referenced id in a ``Div``; the page id wins over the database id.

        Returns None when neither id is set.
        """
        referenced_id = self.page_id or self.database_id
        if not referenced_id:
            return None
        return Div([], referenced_id)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#numbered-list-item
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Li
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class NumberedListItem(BlockBase):
    """Notion ``numbered_list_item`` block.

    See https://developers.notion.com/reference/block#numbered-list-item
    """

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: list items may hold nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a NumberedListItem from its payload dict without modifying ``data``.

        Every key other than ``rich_text`` is forwarded to the constructor as a
        keyword argument, so unexpected or missing keys surface as ``TypeError``.
        """
        # Shallow copy so that stripping "rich_text" does not alter the caller's dict.
        kwargs = dict(data)
        raw_rich_text = kwargs.pop("rich_text", [])
        item = cls(**kwargs)
        item.rich_text = [RichText.from_dict(rt) for rt in raw_rich_text]
        return item

    def get_html(self) -> Optional[HtmlTag]:
        """Render the item's rich text inside an ``Li`` tag."""
        return Li([], [rt.get_html() for rt in self.rich_text])
@@ -0,0 +1,31 @@
1
+ # https://developers.notion.com/reference/block#paragraph
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Br, Div, HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Paragraph(BlockBase):
    """Notion ``paragraph`` block.

    See https://developers.notion.com/reference/block#paragraph
    """

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: paragraphs may hold nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Paragraph from its payload dict without modifying ``data``.

        Every key other than ``rich_text`` is forwarded to the constructor as a
        keyword argument, so unexpected or missing keys surface as ``TypeError``.
        """
        # Shallow copy so that stripping "rich_text" does not alter the caller's dict.
        kwargs = dict(data)
        raw_rich_text = kwargs.pop("rich_text", [])
        paragraph = cls(**kwargs)
        paragraph.rich_text = [RichText.from_dict(rt) for rt in raw_rich_text]
        return paragraph

    def get_html(self) -> Optional[HtmlTag]:
        """Render the text in a ``Div``; an empty paragraph becomes a bare ``Br``."""
        if not self.rich_text:
            return Br()
        return Div([], [rt.get_html() for rt in self.rich_text])
@@ -0,0 +1,49 @@
1
+ # https://developers.notion.com/reference/block#pdf
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.file import External, File
10
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
11
+
12
+
13
@dataclass
class PDF(BlockBase):
    """Notion ``pdf`` block: a link to a PDF plus an optional caption.

    See https://developers.notion.com/reference/block#pdf
    """

    # Discriminator from the payload; from_dict handles "external" and "file".
    type: str
    caption: List[RichText] = field(default_factory=list)
    external: Optional[External] = None
    file: Optional[File] = None

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type does not hold child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a PDF block from its payload dict.

        The input dict is only read; no keys are removed from it.

        Raises:
            KeyError: if ``type`` is missing, or if the key named by ``type``
                ("external" or "file") is missing.
        """
        # get() instead of pop(): parsing should not strip keys from the caller's dict.
        raw_caption = data.get("caption", [])
        kind = data["type"]
        pdf = cls(type=kind)
        pdf.caption = [RichText.from_dict(c) for c in raw_caption]
        if kind == "external":
            pdf.external = External.from_dict(data["external"])
        elif kind == "file":
            pdf.file = File.from_dict(data["file"])
        return pdf

    def get_html(self) -> Optional[HtmlTag]:
        """Render the PDF link(s) and caption, separated by line breaks.

        Returns:
            A ``Div`` wrapping the rendered pieces, or None when the block has
            no external file, no hosted file and no caption.
        """
        pieces = []
        if self.external:
            pieces.append(A([Href(self.external.url)], self.external.url))
        if self.file:
            pieces.append(A([Href(self.file.url)], self.file.url))
        if self.caption:
            pieces.append(Div([], [rt.get_html() for rt in self.caption]))
        if not pieces:
            return None

        # Interleave a fresh <br> between pieces rather than reusing one shared tag object.
        joined = []
        for position, piece in enumerate(pieces):
            if position:
                joined.append(Br())
            joined.append(piece)
        return Div([], joined)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#quote
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Quote(BlockBase):
    """Notion ``quote`` block.

    See https://developers.notion.com/reference/block#quote
    """

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: quotes may hold nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Quote from its payload dict without modifying ``data``.

        Every key other than ``rich_text`` is forwarded to the constructor as a
        keyword argument, so unexpected or missing keys surface as ``TypeError``.
        """
        # Shallow copy so that stripping "rich_text" does not alter the caller's dict.
        kwargs = dict(data)
        raw_rich_text = kwargs.pop("rich_text", [])
        quote = cls(**kwargs)
        quote.rich_text = [RichText.from_dict(rt) for rt in raw_rich_text]
        return quote

    def get_html(self) -> Optional[HtmlTag]:
        """Render the quote text in a ``Div``; return None when there is no text.

        A ``color`` other than "default" is emitted as an inline ``color`` style.
        """
        if not self.rich_text:
            return None

        rendered = [rt.get_html() for rt in self.rich_text]
        attributes = []
        if self.color and self.color != "default":
            attributes.append(Style(f"color: {self.color}"))
        return Div(attributes, rendered)
@@ -0,0 +1,109 @@
1
+ # https://developers.notion.com/reference/block#synced-block
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class OriginalSyncedBlock(BlockBase):
    """The source copy of a Notion synced block; it carries the child content."""

    synced_from: Optional[str] = None
    children: List[dict] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: the original synced block holds the children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build an OriginalSyncedBlock from a payload that contains ``children``.

        Raises:
            ValueError: if ``data`` has no ``children`` key.
        """
        if "children" in data:
            return cls(children=data["children"])
        raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML of its own."""
        return None
31
+
32
+
33
@dataclass
class DuplicateSyncedBlock(BlockBase):
    """A synced block that references another block through ``synced_from``."""

    type: str
    block_id: str

    @staticmethod
    def can_have_children() -> bool:
        """Return True for duplicate synced blocks.

        The duplicate does not carry children itself, but it stands in for
        content that does, so the Notion API may report has_children=True on
        the parent block object. The children come from the original block.
        """
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a DuplicateSyncedBlock from the nested ``synced_from`` reference.

        Raises:
            ValueError: if ``synced_from`` is absent, empty or not a dict, or if
                it lacks ``type`` or ``block_id``.
        """
        reference = data.get("synced_from")
        if not (reference and isinstance(reference, dict)):
            raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")

        # Both keys of the nested reference are required.
        if not ("type" in reference and "block_id" in reference):
            raise ValueError(
                f"Missing 'type' or 'block_id' in synced_from data: {reference}"
            )

        return cls(type=reference["type"], block_id=reference["block_id"])

    def get_html(self) -> Optional[HtmlTag]:
        """Return None.

        Rendering would require fetching the original block's content, which
        this data class does not do.
        """
        return None
71
+
72
+
73
class SyncBlock(BlockBase):
    """Entry point that dispatches a synced-block payload to the matching class."""

    @staticmethod
    def can_have_children() -> bool:
        """Return True: both original and duplicate synced blocks can have children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return a DuplicateSyncedBlock or OriginalSyncedBlock built from ``data``.

        A non-None ``synced_from`` marks a duplicate; a ``children`` key marks
        an original.
        """
        if data.get("synced_from") is not None:
            # Duplicate: the payload holds a reference to the original block.
            return DuplicateSyncedBlock.from_dict(data)

        if "children" not in data:
            # Neither marker is present. The Notion API might return this for an
            # empty original synced block, so treat it as one with no children.
            # If that assumption is wrong, errors might occur later; consider
            # logging a warning here if strictness is needed.
            return OriginalSyncedBlock(children=[])

        # Original: the payload holds the children.
        return OriginalSyncedBlock.from_dict(data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return None.

        from_dict returns an Original or Duplicate instance, which provides its
        own get_html, so this method may never be called directly.
        """
        return None
@@ -0,0 +1,60 @@
1
+ # https://developers.notion.com/reference/block#table
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Td, Th, Tr
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase, FromJSONMixin
8
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Table(BlockBase):
    """Notion table block; holds table-level settings only."""

    # Field names mirror the keys expected in the payload passed to from_dict.
    table_width: int
    has_column_header: bool
    has_row_header: bool

    @staticmethod
    def can_have_children() -> bool:
        """Report that a table may contain child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Table by passing every key of ``data`` as a keyword argument."""
        table = cls(**data)
        return table

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: the table block itself produces no HTML here."""
        return None
27
+
28
+
29
@dataclass
class TableCell(FromJSONMixin):
    """A single table cell made of a list of rich-text fragments."""

    rich_texts: List[RichText]

    @classmethod
    def from_dict(cls, data: dict):
        """Build a cell from ``data``.

        The ``rich_texts`` entry is popped from ``data`` (the dict is mutated);
        a missing entry yields an empty cell.
        """
        raw_fragments = data.pop("rich_texts", [])
        return cls(rich_texts=[RichText.from_dict(raw) for raw in raw_fragments])

    def get_html(self, is_header: bool) -> Optional[HtmlTag]:
        """Render the cell as ``Th`` when ``is_header`` is truthy, else ``Td``."""
        cell_tag = Th if is_header else Td
        fragments = [rt.get_html() for rt in self.rich_texts]
        return cell_tag([], fragments)
42
+
43
+
44
+ # https://developers.notion.com/reference/block#table-rows
45
@dataclass
class TableRow(BlockBase):
    """A table row: an ordered list of cells plus a header flag."""

    # from_dict never sets this flag, so it keeps its default unless the
    # caller assigns it afterwards.
    is_header: bool = False
    cells: List[TableCell] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a row from ``data["cells"]``.

        Each entry of ``cells`` is wrapped as ``{"rich_texts": entry}`` and
        handed to ``TableCell.from_dict``. A missing key gives an empty row.
        """
        raw_cells = data.get("cells", [])
        parsed_cells = [
            TableCell.from_dict({"rich_texts": raw_cell}) for raw_cell in raw_cells
        ]
        return cls(cells=parsed_cells)

    @staticmethod
    def can_have_children() -> bool:
        """Report that table rows hold no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render the row as a ``Tr`` containing every cell's HTML."""
        rendered_cells = [
            cell.get_html(is_header=self.is_header) for cell in self.cells
        ]
        return Tr([], rendered_cells)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#table-of-contents
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class TableOfContents(BlockBase):
    """Notion table-of-contents block; carries only a color."""

    color: str

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block holds no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` as a keyword argument."""
        toc = cls(**data)
        return toc

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: no HTML is produced for this block."""
        return None
@@ -0,0 +1,30 @@
1
+ # https://developers.notion.com/reference/block#template
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Template(BlockBase):
    """Notion template block: rich text plus optional raw child payloads."""

    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that template blocks may contain child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Template from ``data``.

        ``rich_text`` is popped from ``data`` (the dict is mutated). The
        remaining keys go to the constructor, and the parsed rich text is
        attached afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text inside a ``Div``; return None when there is none."""
        if self.rich_text:
            fragments = [rt.get_html() for rt in self.rich_text]
            return Div([], fragments)
        return None
@@ -0,0 +1,42 @@
1
+ # https://developers.notion.com/reference/block#to-do
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Checked, Style, Type
6
+ from htmlBuilder.tags import Div, HtmlTag, Input
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class ToDo(BlockBase):
    """Notion to-do block: a checkbox followed by rich text."""

    color: str
    checked: bool = False
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that to-do blocks may contain child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ToDo from ``data``.

        ``rich_text`` is popped from ``data`` (the dict is mutated). The
        remaining keys go to the constructor, and the parsed rich text is
        attached afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render as a ``Div`` holding a checkbox ``Input`` and the rich text.

        Returns None when there is no rich text. The checkbox gets a
        ``Checked`` attribute when ``checked`` is truthy, and the ``Div`` gets
        a color style unless the color is empty or ``"default"``.
        """
        if not self.rich_text:
            return None

        checkbox_attrs = [Type("checkbox")]
        if self.checked:
            checkbox_attrs.append(Checked(""))

        # The checkbox comes first, then the text fragments in order.
        children = [Input(checkbox_attrs)]
        children += [rt.get_html() for rt in self.rich_text]

        has_custom_color = self.color and self.color != "default"
        div_attrs = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(div_attrs, children)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#toggle-blocks
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Toggle(BlockBase):
    """Notion toggle block: colored rich text plus optional raw child payloads."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that toggle blocks may contain child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Toggle from ``data``.

        ``rich_text`` is popped from ``data`` (the dict is mutated). The
        remaining keys go to the constructor, and the parsed rich text is
        attached afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text inside a ``Div``; return None when there is none.

        The ``Div`` gets a color style unless the color is empty or
        ``"default"``.
        """
        if not self.rich_text:
            return None

        fragments = [rt.get_html() for rt in self.rich_text]
        has_custom_color = self.color and self.color != "default"
        div_attrs = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(div_attrs, fragments)
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.tags import HtmlTag
5
+
6
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
7
+
8
+
9
@dataclass
class Unsupported(BlockBase):
    """Placeholder block that stores nothing and renders nothing."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block holds no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Return an empty instance; the contents of ``data`` are ignored."""
        placeholder = cls()
        return placeholder

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: no HTML is produced for this block."""
        return None
@@ -0,0 +1,22 @@
1
+ # https://developers.notion.com/reference/block#video
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.attributes import Src
5
+ from htmlBuilder.tags import HtmlTag, Source
6
+ from htmlBuilder.tags import Video as VideoHtml
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.file import FileObject
10
+
11
+
12
class Video(BlockBase, FileObject):
    """Notion video block rendered as an HTML video element."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that video blocks hold no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render a video tag with one ``Source`` child pointing at the video URL.

        ``external`` wins over ``file`` when both are set. The URL is used both
        as the ``Src`` attribute and as the inner content of the ``Source``
        tag. Returns None when neither location is set.
        """
        # `or` keeps the original precedence: external first, then file.
        location = self.external or self.file
        if not location:
            return None
        return VideoHtml([], [Source([Src(location.url)], [location.url])])