unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,24 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class RetryStrategyConfig:
    """
    Retry/backoff limits that a decorator pulls from `self` when an
    exception triggers a retry.

    Args:
        max_retries: Upper bound on the number of attempts. Once it is
            exhausted the exception is allowed to escape. ``None`` (the
            default) places no limit on the number of tries. If a callable
            is passed, it is evaluated at runtime and its return value used.
        max_retry_time: Upper bound on the total time spent trying. Once it
            has expired the exception is allowed to escape. If a callable
            is passed, it is evaluated at runtime and its return value used.
    """

    max_retries: Optional[int] = None
    max_retry_time: Optional[float] = None
@@ -0,0 +1,32 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.tags import HtmlTag
5
+
6
+
7
class FromJSONMixin(ABC):
    """Interface for types that can be constructed from a plain dict."""

    @classmethod
    @abstractmethod
    def from_dict(cls, data: dict):
        """Build an instance from ``data``; subclasses must implement this."""
12
+
13
+
14
class GetHTMLMixin(ABC):
    """Interface for types that can render themselves as an HTML tag."""

    @abstractmethod
    def get_html(self) -> Optional[HtmlTag]:
        """Return the rendered tag, or None when there is nothing to render."""
18
+
19
+
20
class BlockBase(FromJSONMixin, GetHTMLMixin):
    """Base for block payload types: dict-constructible and HTML-renderable."""

    @staticmethod
    @abstractmethod
    def can_have_children() -> bool:
        """Report whether this block type can have child blocks."""
25
+
26
+
27
class DBPropertyBase(FromJSONMixin):
    """Marker base class; adds nothing beyond the dict-construction interface."""
29
+
30
+
31
class DBCellBase(FromJSONMixin, GetHTMLMixin):
    """Marker base class combining dict construction with HTML rendering."""
@@ -0,0 +1,96 @@
1
+ # https://developers.notion.com/reference/page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import (
8
+ BlockBase,
9
+ FromJSONMixin,
10
+ GetHTMLMixin,
11
+ )
12
+ from unstructured_ingest.processes.connectors.notion.types import blocks
13
+ from unstructured_ingest.processes.connectors.notion.types.parent import Parent
14
+ from unstructured_ingest.processes.connectors.notion.types.user import PartialUser
15
+
16
# Lookup from a block's "type" string to the class that parses that block's
# payload. All three heading levels are handled by the same Heading class.
block_type_mapping = {
    "bookmark": blocks.Bookmark,
    "breadcrumb": blocks.Breadcrumb,
    "bulleted_list_item": blocks.BulletedListItem,
    "callout": blocks.Callout,
    "child_database": blocks.ChildDatabase,
    "child_page": blocks.ChildPage,
    "code": blocks.Code,
    "column": blocks.Column,
    "column_list": blocks.ColumnList,
    "divider": blocks.Divider,
    "heading_1": blocks.Heading,
    "heading_2": blocks.Heading,
    "heading_3": blocks.Heading,
    "embed": blocks.Embed,
    "equation": blocks.Equation,
    "file": blocks.File,
    "image": blocks.Image,
    "link_preview": blocks.LinkPreview,
    "link_to_page": blocks.LinkToPage,
    "numbered_list_item": blocks.NumberedListItem,
    "paragraph": blocks.Paragraph,
    "pdf": blocks.PDF,
    "quote": blocks.Quote,
    "synced_block": blocks.SyncBlock,
    "table": blocks.Table,
    "table_of_contents": blocks.TableOfContents,
    "table_row": blocks.TableRow,
    "template": blocks.Template,
    "to_do": blocks.ToDo,
    "toggle": blocks.Toggle,
    "unsupported": blocks.Unsupported,
    "video": blocks.Video,
}
50
+
51
+
52
@dataclass
class Block(FromJSONMixin, GetHTMLMixin):
    """Envelope for one block object.

    Holds the fields common to every block and, in ``block``, the parsed
    type-specific payload.
    """

    id: str
    type: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    has_children: bool
    parent: Parent
    block: BlockBase
    object: str = "block"
    request_id: Optional[str] = None

    def __repr__(self):
        return f"{self.__class__.__name__}(id={self.id}, type={self.type})"

    @classmethod
    def from_dict(cls, data: dict):
        """Parse a block dict into a ``Block``.

        The payload stored under the key named by ``data["type"]`` is parsed
        by the class registered for that type in ``block_type_mapping``.
        Every key left in ``data`` is forwarded to the constructor as-is.

        Note: ``data`` is modified in place; the payload, ``created_by``,
        ``last_edited_by`` and ``parent`` keys are popped from it.

        Raises:
            KeyError: a required key is missing, or a ``KeyError`` occurred
                while parsing (for example an unregistered block type).
            TypeError: construction failed, for example because ``data``
                carries a key the dataclass does not declare.
        """
        t = data["type"]
        block_data = data.pop(t)
        author = data.pop("created_by")
        editor = data.pop("last_edited_by")
        parent_data = data.pop("parent")
        try:
            # Parsed in the same order as the constructor's keyword arguments.
            parsed = {
                "created_by": PartialUser.from_dict(author),
                "last_edited_by": PartialUser.from_dict(editor),
                "parent": Parent.from_dict(parent_data),
                "block": block_type_mapping[t].from_dict(block_data),  # type: ignore
            }
            return cls(**parsed, **data)
        except KeyError as ke:
            raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke
        except TypeError as te:
            raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te

    def get_html(self) -> Optional[HtmlTag]:
        """Delegate rendering to the payload; None when the payload is falsy."""
        return self.block.get_html() if self.block else None
@@ -0,0 +1,63 @@
1
+ from .bookmark import Bookmark
2
+ from .breadcrumb import Breadcrumb
3
+ from .bulleted_list_item import BulletedListItem
4
+ from .callout import Callout
5
+ from .child_database import ChildDatabase
6
+ from .child_page import ChildPage
7
+ from .code import Code
8
+ from .column_list import Column, ColumnList
9
+ from .divider import Divider
10
+ from .embed import Embed
11
+ from .equation import Equation
12
+ from .file import File
13
+ from .heading import Heading
14
+ from .image import Image
15
+ from .link_preview import LinkPreview
16
+ from .link_to_page import LinkToPage
17
+ from .numbered_list import NumberedListItem
18
+ from .paragraph import Paragraph
19
+ from .pdf import PDF
20
+ from .quote import Quote
21
+ from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock
22
+ from .table import Table, TableRow
23
+ from .table_of_contents import TableOfContents
24
+ from .template import Template
25
+ from .todo import ToDo
26
+ from .toggle import Toggle
27
+ from .unsupported import Unsupported
28
+ from .video import Video
29
+
30
+ __all__ = [
31
+ "Bookmark",
32
+ "Breadcrumb",
33
+ "BulletedListItem",
34
+ "Callout",
35
+ "ChildDatabase",
36
+ "ChildPage",
37
+ "Code",
38
+ "Column",
39
+ "ColumnList",
40
+ "Divider",
41
+ "Embed",
42
+ "Equation",
43
+ "File",
44
+ "Heading",
45
+ "Image",
46
+ "LinkPreview",
47
+ "LinkToPage",
48
+ "NumberedListItem",
49
+ "Paragraph",
50
+ "PDF",
51
+ "Quote",
52
+ "SyncBlock",
53
+ "OriginalSyncedBlock",
54
+ "DuplicateSyncedBlock",
55
+ "Table",
56
+ "TableRow",
57
+ "TableOfContents",
58
+ "Template",
59
+ "ToDo",
60
+ "Toggle",
61
+ "Unsupported",
62
+ "Video",
63
+ ]
@@ -0,0 +1,40 @@
1
+ # https://developers.notion.com/reference/block#bookmark
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Bookmark(BlockBase):
    """Bookmark block: a URL plus an optional rich-text caption."""

    url: str
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Bookmark from its payload dict.

        Note: the ``caption`` key is popped from ``data``.
        """
        raw_caption = data.pop("caption", [])
        # Read the URL before parsing captions so a missing "url" fails first.
        url = data["url"]
        return cls(url=url, caption=[RichText.from_dict(item) for item in raw_caption])

    def get_html(self) -> Optional[HtmlTag]:
        """Render the link and caption separated by a line break.

        Returns None when there is neither a URL nor a caption.
        """
        parts = []
        if self.url:
            parts.append(A([Href(self.url)], self.url))
        if self.caption:
            parts.append(Div([], [rt.get_html() for rt in self.caption]))
        if not parts:
            return None

        # Put a <br> between consecutive parts.
        joined = []
        for position, part in enumerate(parts):
            if position:
                joined.append(Br())
            joined.append(part)
        return Div([], joined)
@@ -0,0 +1,21 @@
1
+ # https://developers.notion.com/reference/block#breadcrumb
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class Breadcrumb(BlockBase):
    """Breadcrumb block: carries no data and renders nothing."""

    @classmethod
    def from_dict(cls, data: dict):
        """Return an empty instance; ``data`` is not read."""
        return cls()

    @staticmethod
    def can_have_children() -> bool:
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Always None: there is nothing to render for this block."""
        return None
@@ -0,0 +1,31 @@
1
+ # https://developers.notion.com/reference/block#bulleted-list-item
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Li
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class BulletedListItem(BlockBase):
    """Bulleted list item block, rendered as an ``<li>``."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build an item from its payload dict.

        Note: the ``rich_text`` key is popped from ``data``.
        """
        raw_text = data.pop("rich_text", [])
        # Lookups stay ahead of rich-text parsing so a missing "color" fails first.
        color = data["color"]
        children = data.get("children", [])
        return cls(
            color=color,
            children=children,
            rich_text=[RichText.from_dict(item) for item in raw_text],
        )

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Wrap the rendered rich-text fragments in a list-item tag."""
        fragments = [rt.get_html() for rt in self.rich_text]
        return Li([], fragments)
@@ -0,0 +1,131 @@
1
+ # https://developers.notion.com/reference/block#callout
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional, Union
4
+
5
+ from htmlBuilder.attributes import Href, Style
6
+ from htmlBuilder.tags import A, Div, HtmlTag, P
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import (
9
+ BlockBase,
10
+ FromJSONMixin,
11
+ GetHTMLMixin,
12
+ )
13
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
14
+
15
+
16
@dataclass
class EmojiIcon(FromJSONMixin, GetHTMLMixin):
    """Icon given as an emoji character."""

    emoji: str
    type: str = "emoji"

    @classmethod
    def from_dict(cls, data: dict):
        """Construct from ``data``; its keys must match the dataclass fields."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the emoji inside a paragraph tag."""
        return P([], self.emoji)
27
+
28
+
29
@dataclass
class ExternalIconContent(FromJSONMixin):
    """Payload of an external icon: just its URL."""

    url: str

    @classmethod
    def from_dict(cls, data: dict):
        """Construct from ``data``; its keys must match the dataclass fields."""
        return cls(**data)
36
+
37
+
38
@dataclass
class ExternalIcon(FromJSONMixin, GetHTMLMixin):
    """Icon referenced by an external URL."""

    external: ExternalIconContent
    type: str = "external"

    @classmethod
    def from_dict(cls, data: dict):
        """Parse the nested ``external`` payload and forward the remaining keys.

        Note: the ``external`` key is popped from ``data``.
        """
        content = ExternalIconContent.from_dict(data=data.pop("external"))
        return cls(external=content, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the URL as a link; None when there is no external content."""
        if not self.external:
            return None
        return A([Href(self.external.url)], [self.external.url])
52
+
53
+
54
@dataclass
class FileIconContent(FromJSONMixin):
    """Payload of a file icon: its URL and, when present, an expiry time."""

    url: str
    expiry_time: Optional[str] = None  # Add expiry_time if needed

    @classmethod
    def from_dict(cls, data: dict):
        """Construct from ``data``, reading only ``url`` and ``expiry_time``.

        Other keys are ignored. ``expiry_time`` may be absent from the
        payload, in which case it stays None; a missing ``url`` also becomes
        None rather than raising.
        """
        return cls(url=data.get("url"), expiry_time=data.get("expiry_time"))
67
+
68
+
69
@dataclass
class FileIcon(FromJSONMixin, GetHTMLMixin):
    """Icon backed by a file URL."""

    file: FileIconContent
    type: str = "file"

    @classmethod
    def from_dict(cls, data: dict):
        """Parse the nested ``file`` payload and forward the remaining keys.

        Note: the ``file`` key is popped from ``data``.
        """
        content = FileIconContent.from_dict(data=data.pop("file"))
        return cls(file=content, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the file URL as a labelled link; None when there is no file.

        A link is used rather than an ``<img>`` tag, in line with how
        ExternalIcon is rendered.
        """
        if not self.file:
            return None
        return A([Href(self.file.url)], [f"[File Icon: {self.file.url}]"])
85
+
86
+
87
class Icon(FromJSONMixin):
    """Factory that picks the concrete icon class from the ``type`` field."""

    @classmethod
    def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon, FileIcon]:
        """Return the icon object matching ``data["type"]``.

        Raises:
            ValueError: the type is missing or not one of emoji/external/file.
        """
        t = data.get("type")
        # Checked in order with ==, one (type name, class) pair at a time.
        candidates = (
            ("emoji", EmojiIcon),
            ("external", ExternalIcon),
            ("file", FileIcon),
        )
        for type_name, icon_cls in candidates:
            if t == type_name:
                return icon_cls.from_dict(data)
        raise ValueError(f"Unexpected icon type: {t} ({data})")
99
+
100
+
101
@dataclass
class Callout(BlockBase):
    """Callout block: optional icon followed by rich text, tinted by ``color``."""

    color: str
    icon: Optional[Union[EmojiIcon, ExternalIcon, FileIcon]] = None
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build a Callout from its payload dict.

        The icon is parsed only when the payload carries a truthy ``icon``
        value; otherwise the field stays None.

        Note: the ``rich_text`` and ``icon`` keys are popped from ``data``.

        Raises:
            KeyError: ``color`` is missing.
            ValueError: the icon has an unexpected type.
        """
        rich_text = data.pop("rich_text", [])
        icon_data = data.pop("icon", None)
        icon = Icon.from_dict(icon_data) if icon_data else None
        return cls(
            color=data["color"],
            icon=icon,
            rich_text=[RichText.from_dict(rt) for rt in rich_text],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render the icon (if any) and rich text inside a ``<div>``.

        A non-empty ``color`` is emitted as an inline ``color:`` style.
        """
        elements = []
        # Render the icon once and reuse the result, instead of rendering it
        # for the truthiness check and then again for the append.
        icon_html = self.icon.get_html() if self.icon else None
        if icon_html:
            elements.append(icon_html)
        if self.rich_text:
            elements.extend([rt.get_html() for rt in self.rich_text])
        attributes = []
        if self.color:
            attributes.append(Style(f"color:{self.color}"))
        return Div(attributes, elements)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#child-database
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, P
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class ChildDatabase(BlockBase):
    """Child database block, rendered as its title."""

    title: str

    @classmethod
    def from_dict(cls, data: dict):
        """Construct from ``data``; its keys must match the dataclass fields."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Render the title inside a paragraph tag."""
        return P([], self.title)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#child-page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, P
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase, GetHTMLMixin
8
+
9
+
10
@dataclass
class ChildPage(BlockBase, GetHTMLMixin):
    """Child page block, rendered as its title."""

    title: str

    @classmethod
    def from_dict(cls, data: dict):
        """Construct from ``data``; its keys must match the dataclass fields."""
        return cls(**data)

    @staticmethod
    def can_have_children() -> bool:
        return True

    def get_html(self) -> Optional[HtmlTag]:
        """Render the title inside a paragraph tag."""
        return P([], self.title)
@@ -0,0 +1,43 @@
1
+ # https://developers.notion.com/reference/block#code
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Br, Div, HtmlTag
6
+ from htmlBuilder.tags import Code as HtmlCode
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Code(BlockBase):
    """Notion ``code`` block: a language tag, code text and optional caption."""

    # Value of the payload's "language" key.
    language: str
    # Parsed rich-text entries that make up the code body.
    rich_text: List[RichText] = field(default_factory=list)
    # Parsed rich-text entries that make up the caption.
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance from a block payload dictionary.

        The ``"rich_text"`` and ``"caption"`` keys are popped from ``data``,
        so the caller's dictionary is modified.

        Args:
            data: Payload dictionary; must contain ``"language"``.

        Returns:
            A new ``Code`` with both lists converted via
            ``RichText.from_dict``.

        Raises:
            KeyError: If ``data`` has no ``"language"`` key.
        """
        raw_rich_text = data.pop("rich_text", [])
        raw_caption = data.pop("caption", [])
        return cls(
            language=data["language"],
            rich_text=[RichText.from_dict(entry) for entry in raw_rich_text],
            caption=[RichText.from_dict(entry) for entry in raw_caption],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render the code and caption as one ``Div``.

        The code text is wrapped in a ``Code`` tag and the caption in a
        ``Div``; when both are present a ``Br`` is placed between them.

        Returns:
            The wrapping ``Div``, or ``None`` when there is neither code text
            nor caption.
        """
        sections = []
        if self.rich_text:
            code_children = [rt.get_html() for rt in self.rich_text]
            sections.append(HtmlCode([], code_children))
        if self.caption:
            caption_children = [rt.get_html() for rt in self.caption]
            sections.append(Div([], caption_children))
        if not sections:
            return None

        # Interleave the sections with line breaks: section, Br, section, ...
        interleaved = []
        for position, section in enumerate(sections):
            if position:
                interleaved.append(Br())
            interleaved.append(section)

        return Div([], interleaved)
@@ -0,0 +1,35 @@
1
+ # https://developers.notion.com/reference/block#column-list-and-column
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class ColumnList(BlockBase):
    """Notion ``column_list`` block; it stores no fields of its own."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return a new, field-less instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return ``None``: this block produces no HTML of its own."""
        return None
22
+
23
+
24
@dataclass
class Column(BlockBase):
    """Notion ``column`` block; it stores no fields of its own."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type may have child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return a new, field-less instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return ``None``: this block produces no HTML of its own."""
        return None
@@ -0,0 +1,22 @@
1
+ # https://developers.notion.com/reference/block#divider
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Hr, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+
10
+
11
@dataclass
class Divider(BlockBase):
    """Notion ``divider`` block; it stores no fields of its own."""

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Return a new, field-less instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``Hr`` tag styled with a 3px solid top border."""
        border_style = Style("border-top: 3px solid #bbb")
        return Hr([border_style])
@@ -0,0 +1,36 @@
1
+ # https://developers.notion.com/reference/block#embed
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
9
+ from unstructured_ingest.processes.connectors.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Embed(BlockBase):
    """Notion ``embed`` block: a URL plus an optional caption."""

    # Value of the payload's "url" key.
    url: str
    # Parsed rich-text entries that make up the caption.
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance from a block payload dictionary.

        The ``"caption"`` key is popped from ``data`` (modifying the caller's
        dictionary) and converted via ``RichText.from_dict``; every remaining
        key is passed to the constructor as a keyword argument.

        Args:
            data: Payload dictionary whose non-caption keys must match the
                dataclass fields.

        Returns:
            A new ``Embed``.
        """
        # Pop the caption first so it is not passed twice through **data.
        raw_caption = data.pop("caption", [])
        parsed_caption = [RichText.from_dict(entry) for entry in raw_caption]
        return cls(caption=parsed_caption, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the URL and caption as one ``Div``.

        The URL becomes an ``A`` tag whose link target and text are both the
        URL; the caption is wrapped in a ``Div``. When both are present a
        ``Br`` is placed between them.

        Returns:
            The wrapping ``Div``, or ``None`` when there is neither a URL nor
            a caption.
        """
        sections = []
        if self.url:
            link = A([Href(self.url)], self.url)
            sections.append(link)
        if self.caption:
            caption_children = [rt.get_html() for rt in self.caption]
            sections.append(Div([], caption_children))
        if not sections:
            return None

        # Interleave the sections with line breaks: section, Br, section, ...
        interleaved = []
        for position, section in enumerate(sections):
            if position:
                interleaved.append(Br())
            interleaved.append(section)

        return Div([], interleaved)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#equation
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.processes.connectors.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class Equation(BlockBase):
    """Notion ``equation`` block holding a single expression string."""

    # Expression string taken from the block payload.
    expression: str

    @staticmethod
    def can_have_children() -> bool:
        """Report that this block type has no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build an instance by passing every key of ``data`` as a keyword.

        Args:
            data: Payload dictionary whose keys must match the dataclass
                fields.

        Returns:
            A new ``Equation``.
        """
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return the expression wrapped in a ``Div`` with no attributes."""
        wrapper = Div([], self.expression)
        return wrapper