unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
@@ -0,0 +1,448 @@
1
+ import enum
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional, Tuple
5
+ from urllib.parse import urlparse
6
+ from uuid import UUID
7
+
8
+ from htmlBuilder.attributes import Style
9
+ from htmlBuilder.tags import (
10
+ Body,
11
+ Div,
12
+ Head,
13
+ Html,
14
+ HtmlTag,
15
+ Ol,
16
+ Table,
17
+ Td,
18
+ Th,
19
+ Title,
20
+ Tr,
21
+ Ul,
22
+ )
23
+ from notion_client.errors import APIResponseError
24
+
25
+ import unstructured_ingest.processes.connectors.notion.types.blocks as notion_blocks
26
+ from unstructured_ingest.processes.connectors.notion.client import Client
27
+ from unstructured_ingest.processes.connectors.notion.types.block import Block
28
+ from unstructured_ingest.processes.connectors.notion.types.database import Database
29
+
30
+
31
@dataclass
class HtmlExtractionResponse:
    """Outcome of rendering a Notion page or database to HTML.

    Bundles the rendered markup together with the ids of the child pages and
    child databases that were collected during extraction.
    """

    # Root tag of the rendered markup; defaults to None.
    html: Optional[HtmlTag] = None
    # Ids of child pages collected during extraction.
    child_pages: List[str] = field(default_factory=list)
    # Ids of child databases collected during extraction.
    child_databases: List[str] = field(default_factory=list)
36
+
37
+
38
def process_block(
    current_block: dict,
    parent_page_id: str,
    client: Client,
    child_pages: list,
    child_databases: list,
) -> Tuple[dict, list, list]:
    """Recursively expand ``current_block`` into a tree of block dictionaries.

    Args:
        current_block: Node dict with at least the keys ``"block"``,
            ``"level"`` and ``"children"``; its ``"children"`` list is
            filled in place.
        parent_page_id: Id of the page being extracted. It is compared via
            ``str(parent_page_id)``, so a ``UUID`` instance works as well.
        client: Client used to list the children of a block.
        child_pages: Accumulator of child page ids (mutated in place).
        child_databases: Accumulator of child database ids (mutated in place).

    Returns:
        A 3-tuple ``(node, child_pages, child_databases)``. ``node`` is an
        empty dict when the block is a child page (other than the page being
        extracted) or a child database; such blocks are only recorded in the
        accumulators and are not descended into.
    """
    block = current_block["block"]

    # A child page other than the page itself is recorded, not rendered.
    if isinstance(block.block, notion_blocks.ChildPage) and block.id != str(parent_page_id):
        child_pages.append(block.id)
        return {}, child_pages, child_databases
    if isinstance(block.block, notion_blocks.ChildDatabase):
        child_databases.append(block.id)
        return {}, child_pages, child_databases

    # recursively go through all blocks in a page, store each block in a dictionary
    if block.has_children:
        children = []
        for children_block in client.blocks.children.iterate_list(block_id=block.id):
            children.extend(children_block)
        for child in children:
            child_block = {
                "block": child,
                "level": current_block["level"] + 1,
                "children": [],
                "parent_id": block.id,
            }
            child_element, child_pages, child_databases = process_block(
                child_block, parent_page_id, client, child_pages, child_databases
            )
            current_block["children"].append(child_element)
    return current_block, child_pages, child_databases
74
+
75
+
76
def flush_list(type: str, item_list: list, html: list) -> Tuple[list, list]:
    """Wrap the pending list items in a list tag and append it to ``html``.

    Each entry of ``item_list`` is indexed as ``entry[1]`` (nesting level)
    and ``entry[2]`` (rendered item). The left margin is derived from the
    level of the last entry. ``"bulleted_list"`` produces a ``Ul``; any other
    ``type`` produces an ``Ol``.

    Returns:
        ``([], html)``: a fresh empty pending list and the updated ``html``.
    """
    indent_px = 10 * (item_list[-1][1] - 1)
    list_style = Style(f"margin-left: {indent_px}px")
    rendered_items = [entry[2] for entry in item_list]
    list_tag = Ul if type == "bulleted_list" else Ol
    html.append(list_tag([list_style], rendered_items))
    return [], html
84
+
85
+
86
def build_html(
    current_block: dict, bulleted_list: list, numbered_list: list
) -> Tuple[list, list, list]:
    """Render ``current_block`` and its descendants into a list of HTML parts.

    Consecutive list items are buffered in ``bulleted_list`` /
    ``numbered_list`` as ``(parent_id, level, html)`` tuples and emitted via
    ``flush_list`` when the parent changes, when the item is the last of its
    peers, or when a non-list block is encountered.

    ``current_block`` must carry ``"peers_rank"`` and ``"peers_count"``; this
    function sets both on every non-empty child before recursing into it.

    Returns:
        ``(html, bulleted_list, numbered_list)`` with the updated buffers.
    """
    html = []
    node = current_block["block"]
    content = node.block

    # extract current block's html
    if isinstance(content, notion_blocks.BulletedListItem):
        owner_id = current_block["parent_id"]
        # Items that belong to a different parent start a new list.
        if bulleted_list and bulleted_list[-1][0] != owner_id:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
        bulleted_list.append((owner_id, current_block["level"], node.get_html()))
        # The buffer is non-empty here, so only the "last peer" test matters.
        if current_block["peers_rank"] == current_block["peers_count"] - 1:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
    elif isinstance(content, notion_blocks.NumberedListItem):
        owner_id = current_block["parent_id"]
        if numbered_list and numbered_list[-1][0] != owner_id:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
        numbered_list.append((owner_id, current_block["level"], node.get_html()))
        if current_block["peers_rank"] == current_block["peers_count"] - 1:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
    else:
        # Any other block ends whatever lists were being accumulated.
        if bulleted_list:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
        if numbered_list:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
        # The first row among its peers is flagged as the header row.
        if isinstance(content, notion_blocks.TableRow) and current_block["peers_rank"] == 0:
            node.is_header = True
        if node.get_html():
            html.append(node.get_html())
        else:
            html.append([])

    # process current block's children
    children = current_block["children"]
    if children:
        sibling_total = len(children)
        rendered_children = []
        for position, child in enumerate(children):
            # Falsy entries (empty dicts) are skipped.
            if not child:
                continue
            child["peers_rank"] = position
            child["peers_count"] = sibling_total
            rendered, bulleted_list, numbered_list = build_html(
                child, bulleted_list, numbered_list
            )
            if rendered:
                rendered_children.append(rendered)

        if isinstance(content, notion_blocks.Column):
            # Columns split the width evenly between their peers.
            column_style = Style(f"width:{100 / current_block['peers_count']}%; float: left")
            wrapper = Div([column_style], rendered_children)
        elif isinstance(content, notion_blocks.Table):
            wrapper = Table([], rendered_children)
        else:
            wrapper = Div([], rendered_children)
        html.append(wrapper)

    return html, bulleted_list, numbered_list
146
+
147
+
148
def extract_page_html(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render the page ``page_id`` as an HTML document.

    The page's block is retrieved through ``client``, expanded into a block
    tree with ``process_block`` and rendered with ``build_html``. A ``Head``
    with the page title is included only when the root block is a
    ``ChildPage``.

    Returns:
        An ``HtmlExtractionResponse`` holding the ``Html`` tag and the child
        page / child database ids collected while walking the blocks.
    """
    parent_page_id = UUID(page_id)
    parent_block: Block = client.blocks.retrieve(block_id=page_id)  # type: ignore

    if isinstance(parent_block.block, notion_blocks.ChildPage):
        head = Head([], Title([], parent_block.block.title))
    else:
        head = None

    # Root node of the block tree: no parent and a single peer (itself).
    root = {
        "block": parent_block,
        "level": 0,
        "children": [],
        "parent_id": None,
        "peers_rank": 0,
        "peers_count": 1,
    }
    logger.debug(f"processing page id: {page_id}")
    root, child_pages, child_databases = process_block(root, parent_page_id, client, [], [])
    html, _, _ = build_html(root, [], [])

    body = Body([], html)
    document_parts = [head, body] if head else [body]
    return HtmlExtractionResponse(
        Html([], document_parts),
        child_pages=child_pages,
        child_databases=child_databases,
    )
181
+
182
+
183
def extract_database_html(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render the database ``database_id`` as an HTML table.

    The header row lists the database's property names in sorted order; each
    queried page becomes one row whose cells follow the same order. A
    property that renders to a falsy value gets an empty ``Div`` as its cell.
    Rows whose URL passes ``is_database_url`` / ``is_page_url`` are recorded
    as child databases / child pages.

    Returns:
        An ``HtmlExtractionResponse`` holding the ``Table`` tag and the
        collected child ids.
    """
    logger.debug(f"processing database id: {database_id}")
    database: Database = client.databases.retrieve(database_id=database_id)  # type: ignore
    property_keys = sorted(database.properties.keys())

    child_pages: List[str] = []
    child_databases: List[str] = []

    # Create header row
    header_row = Tr([], [Th([], key) for key in property_keys])
    table_html_rows = [header_row]

    all_pages = []
    for page_chunk in client.databases.iterate_query(database_id=database_id):  # type: ignore
        all_pages.extend(page_chunk)

    logger.debug(f"creating {len(all_pages)} rows")
    for page in all_pages:
        if is_database_url(client=client, url=page.url):
            child_databases.append(page.id)
        if is_page_url(client=client, url=page.url):
            child_pages.append(page.id)

        row_properties = page.properties
        rendered_values = [
            row_properties.get(key).get_html() for key in property_keys  # type: ignore
        ]
        # Falsy renderings are replaced by an empty Div.
        cells = [Td([], value or Div([], [])) for value in rendered_values]
        table_html_rows.append(Tr([], cells))

    return HtmlExtractionResponse(
        html=Table([], table_html_rows),
        child_pages=child_pages,
        child_databases=child_databases,
    )
224
+
225
+
226
@dataclass
class ChildExtractionResponse:
    """Ids of the child pages and child databases found by a recursive walk."""

    # Ids of the child pages that were found.
    child_pages: List[str] = field(default_factory=list)
    # Ids of the child databases that were found.
    child_databases: List[str] = field(default_factory=list)
230
+
231
+
232
class QueueEntryType(enum.Enum):
    """Kind of Notion object a queued entry refers to."""

    # Entry refers to a database.
    DATABASE = "database"
    # Entry refers to a page.
    PAGE = "page"
235
+
236
+
237
@dataclass
class QueueEntry:
    """A page or database waiting to be visited by the recursive walk."""

    # Whether ``id`` identifies a page or a database.
    type: QueueEntryType
    # Identifier of the page or database.
    id: UUID
241
+
242
+
243
def get_recursive_content_from_page(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Run ``get_recursive_content`` starting from the page ``page_id``.

    ``page_id`` is parsed with ``UUID``, so a malformed id raises there.
    """
    root_entry = QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))
    return get_recursive_content(client=client, init_entry=root_entry, logger=logger)
253
+
254
+
255
def get_recursive_content_from_database(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Run ``get_recursive_content`` starting from the database ``database_id``.

    ``database_id`` is parsed with ``UUID``, so a malformed id raises there.
    """
    root_entry = QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id))
    return get_recursive_content(client=client, init_entry=root_entry, logger=logger)
265
+
266
+
267
def get_recursive_content(
    client: Client,
    init_entry: QueueEntry,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect the ids of all pages and databases reachable from ``init_entry``.

    Entries are taken from the end of a work list. For a page, its child
    blocks are listed and child pages, child databases and ``LinkToPage``
    targets are queued. For a database, its rows are queried and those whose
    URL passes ``is_page_url`` / ``is_database_url`` are queued.

    When listing a page or querying a database raises ``APIResponseError``,
    the error is logged, the id is removed from the collected results, and
    the walk continues with the next entry.

    Args:
        client: Client used to list blocks and query databases.
        init_entry: Page or database to start from. Its own id is not part
            of the result.
        logger: Receives debug and error messages.

    Returns:
        A ``ChildExtractionResponse`` with the collected child page and
        child database ids.
    """
    parents: List[QueueEntry] = [init_entry]
    child_pages: List[str] = []
    child_dbs: List[str] = []
    # Only used for membership tests, so a set keeps each lookup O(1).
    processed = set()

    while parents:
        parent: QueueEntry = parents.pop()
        parent_id = str(parent.id)
        processed.add(parent_id)

        if parent.type == QueueEntryType.PAGE:
            logger.debug(f"getting child data from page: {parent.id}")
            page_children = []
            try:
                for children_block in client.blocks.children.iterate_list(  # type: ignore
                    block_id=parent_id,
                ):
                    page_children.extend(children_block)
            except APIResponseError as api_error:
                logger.error(f"failed to get page with id {parent.id}: {api_error}")
                # Drop the page from the results if it could not be read.
                if parent_id in child_pages:
                    child_pages.remove(parent_id)
                continue
            if not page_children:
                continue

            # Extract child pages
            child_pages_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildPage)
            ]
            if child_pages_from_page:
                logger.debug(
                    "found child pages from parent page {}: {}".format(
                        parent.id,
                        ", ".join([p.block.title for p in child_pages_from_page]),
                    ),
                )
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the traversal order is the same on every run.
            new_pages = list(
                dict.fromkeys(p.id for p in child_pages_from_page if p.id not in processed)
            )
            child_pages.extend(new_pages)
            parents.extend(
                [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages],
            )

            # Extract child databases
            child_dbs_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase)
            ]
            if child_dbs_from_page:
                logger.debug(
                    "found child database from parent page {}: {}".format(
                        parent.id,
                        ", ".join([c.block.title for c in child_dbs_from_page]),
                    ),
                )
            new_dbs = list(
                dict.fromkeys(db.id for db in child_dbs_from_page if db.id not in processed)
            )
            child_dbs.extend(new_dbs)
            parents.extend(
                [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs],
            )

            # Follow links to other pages / databases that are not yet known.
            linked_to_others: List[notion_blocks.LinkToPage] = [
                c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage)
            ]
            for link in linked_to_others:
                if (page_id := link.page_id) and (
                    page_id not in processed and page_id not in child_pages
                ):
                    child_pages.append(page_id)
                    parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id)))
                if (database_id := link.database_id) and (
                    database_id not in processed and database_id not in child_dbs
                ):
                    child_dbs.append(database_id)
                    parents.append(
                        QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)),
                    )

        elif parent.type == QueueEntryType.DATABASE:
            logger.debug(f"getting child data from database: {parent.id}")
            database_pages = []
            try:
                for page_entries in client.databases.iterate_query(  # type: ignore
                    database_id=parent_id,
                ):
                    database_pages.extend(page_entries)
            except APIResponseError as api_error:
                logger.error(f"failed to get database with id {parent.id}: {api_error}")
                # Drop the database from the results if it could not be read.
                if parent_id in child_dbs:
                    child_dbs.remove(parent_id)
                continue
            if not database_pages:
                continue

            child_pages_from_db = [
                p for p in database_pages if is_page_url(client=client, url=p.url)
            ]
            if child_pages_from_db:
                logger.debug(
                    "found child pages from parent database {}: {}".format(
                        parent.id,
                        ", ".join([p.url for p in child_pages_from_db]),
                    ),
                )
            # De-duplicated the same way as in the page branch.
            new_pages = list(
                dict.fromkeys(p.id for p in child_pages_from_db if p.id not in processed)
            )
            child_pages.extend(new_pages)
            parents.extend(
                [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages],
            )

            child_dbs_from_db = [
                p for p in database_pages if is_database_url(client=client, url=p.url)
            ]
            if child_dbs_from_db:
                logger.debug(
                    "found child database from parent database {}: {}".format(
                        parent.id,
                        ", ".join([db.url for db in child_dbs_from_db]),
                    ),
                )
            new_dbs = list(
                dict.fromkeys(db.id for db in child_dbs_from_db if db.id not in processed)
            )
            child_dbs.extend(new_dbs)
            parents.extend(
                [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs],
            )

    return ChildExtractionResponse(
        child_pages=child_pages,
        child_databases=child_dbs,
    )
410
+
411
+
412
def is_valid_uuid(uuid_str: str) -> bool:
    """Report whether ``uuid_str`` can be parsed by ``UUID``."""
    try:
        UUID(uuid_str)
    except Exception:
        # Any failure to construct a UUID means the input is not valid.
        return False
    else:
        return True
418
+
419
+
420
def get_uuid_from_url(path: str) -> Optional[str]:
    """Return the text after the last ``-`` in ``path`` if it is a valid UUID.

    When ``path`` contains no ``-`` the whole string is the candidate.
    Returns ``None`` when the candidate does not pass ``is_valid_uuid``.
    """
    candidate = path.rpartition("-")[2]
    return candidate if is_valid_uuid(candidate) else None
425
+
426
+
427
def is_page_url(client: Client, url: str):
    """Report whether ``url`` refers to a page that ``client`` can retrieve.

    Returns ``False`` without calling ``client`` when the host is not
    ``www.notion.so`` or when no UUID can be extracted from the last path
    segment. Otherwise returns whether ``client.pages.retrieve_status``
    yields ``200`` for that UUID.
    """
    parts = urlparse(url)
    if parts.netloc != "www.notion.so":
        return False
    last_segment = parts.path.split("/")[-1]
    page_uuid = get_uuid_from_url(path=last_segment)
    if not page_uuid:
        return False
    return client.pages.retrieve_status(page_id=page_uuid) == 200
437
+
438
+
439
def is_database_url(client: Client, url: str):
    """Report whether ``url`` refers to a database that ``client`` can retrieve.

    Returns ``False`` without calling ``client`` when the host is not
    ``www.notion.so`` or when no UUID can be extracted from the last path
    segment. Otherwise returns whether ``client.databases.retrieve_status``
    yields ``200`` for that UUID.
    """
    parts = urlparse(url)
    if parts.netloc != "www.notion.so":
        return False
    last_segment = parts.path.split("/")[-1]
    database_uuid = get_uuid_from_url(path=last_segment)
    if not database_uuid:
        return False
    return client.databases.retrieve_status(database_id=database_uuid) == 200
@@ -0,0 +1,3 @@
1
+ from ._wrapper import RetryHandler
2
+
3
+ __all__ = ["RetryHandler"]
@@ -0,0 +1,102 @@
1
+ import logging
2
+ import sys
3
+ import traceback
4
+
5
+
6
+ # Default startup handler
7
def _log_start(details, logger, log_level):
    """Log that a retried call is starting, together with its retry budget.

    Args:
        details: Mapping describing the call. ``target`` is required;
            ``max_tries``, ``max_time``, ``exception``, ``args`` and
            ``kwargs`` are optional.
        logger: Object whose ``log(level, msg, *args)`` receives the message.
        log_level: Level to log at. At ``logging.INFO`` and above the call
            arguments are elided as ``...``; below that they are rendered.
    """
    max_tries = details.get("max_tries")
    max_time = details.get("max_time")
    # The limits may be zero-argument callables; resolve them so the numeric
    # format specifiers below receive real numbers.
    if callable(max_tries):
        max_tries = max_tries()
    if callable(max_time):
        max_time = max_time()

    if max_tries is not None and max_time is not None:
        budget = "%.1fs or %d tries"
        budget_args = [max_time, max_tries]
    elif max_tries is not None:
        budget = "%d tries"
        budget_args = [max_tries]
    elif max_time is not None:
        budget = "%.1fs"
        budget_args = [max_time]
    else:
        # No limit configured: formatting None with "%.1fs" would make the
        # logging call fail, so describe the budget in words instead.
        budget = "an unlimited time"
        budget_args = []

    exception = details.get("exception")
    if isinstance(exception, tuple):
        exception = list(exception)
    elif not isinstance(exception, list):
        exception = [exception]
    # Fall back to str() for entries that are not classes (e.g. None).
    exception_s = ", ".join(getattr(e, "__name__", str(e)) for e in exception)

    target_name = details["target"].__name__
    if log_level >= logging.INFO:
        msg = "Attempting %s(...), will retry for " + budget + " given these issues: %s"
        log_args = [target_name] + budget_args + [exception_s]
    else:
        msg = "Attempting %s(%s), will retry for " + budget + " given these issues: %s"
        rendered_inputs = []
        if args := details.get("args"):
            rendered_inputs.extend(str(a) for a in args)
        if kwargs := details.get("kwargs"):
            rendered_inputs.extend(f"{k}={v}" for k, v in kwargs.items())
        log_args = [target_name, ", ".join(rendered_inputs)] + budget_args + [exception_s]
    logger.log(log_level, msg, *log_args)
45
+
46
+
47
+ # Default backoff handler
48
def _log_backoff(details, logger, log_level):
    """Log that a call failed and will be retried after a wait.

    Args:
        details: Mapping describing the attempt. ``target``, ``wait`` and
            ``tries`` are required; ``args``, ``kwargs`` and ``value`` are
            optional.
        logger: Object whose ``log(level, msg, *args)`` receives the message.
        log_level: Level to log at. At ``logging.INFO`` and above a short
            message with elided arguments is used; below that the arguments
            and try count are included.
    """
    target_name = details["target"].__name__
    if log_level >= logging.INFO:
        msg = "Backing off %s(...) for %.1fs (%s)"
        # The "%.1fs" slot is the upcoming wait in seconds, not the try count.
        log_args = [target_name, details["wait"]]
    else:
        msg = "Backing off %.1f seconds after %d tries calling function %s(%s) -> %s"
        rendered_inputs = []
        if args := details.get("args"):
            rendered_inputs.extend(str(a) for a in args)
        if kwargs := details.get("kwargs"):
            rendered_inputs.extend(f"{k}={v}" for k, v in kwargs.items())
        log_args = [
            details["wait"],
            details["tries"],
            target_name,
            ", ".join(rendered_inputs),
        ]

    # Prefer the exception currently being handled; otherwise report the
    # value recorded in details (None when absent).
    exc_typ, exc, _ = sys.exc_info()
    if exc is not None:
        exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
        log_args.append(exc_fmt.rstrip("\n"))
    else:
        log_args.append(str(details.get("value")))
    logger.log(log_level, msg, *log_args)
73
+
74
+
75
+ # Default giveup handler
76
def _log_giveup(details, logger, log_level):
    """Log that retrying has been abandoned for a call.

    Args:
        details: Mapping describing the final attempt. ``target`` and
            ``tries`` are required; ``elapsed``, ``args``, ``kwargs`` and
            ``value`` are optional. ``wait`` is used as the duration only
            when ``elapsed`` is missing.
        logger: Object whose ``log(level, msg, *args)`` receives the message.
        log_level: Level to log at. At ``logging.INFO`` and above a short
            message with elided arguments is used; below that the arguments
            and try count are included.
    """
    target_name = details["target"].__name__
    # Give-up events carry the total elapsed time rather than a pending wait,
    # so read "elapsed" and only fall back to "wait" for older callers.
    elapsed = details.get("elapsed")
    if elapsed is None:
        elapsed = details.get("wait") or 0.0

    if log_level >= logging.INFO:
        msg = "Giving up %s(...) after %.1fs (%s)"
        # The "%.1fs" slot is a duration in seconds, not the try count.
        log_args = [target_name, elapsed]
    else:
        msg = "Giving up after %d tries (%.1fs) calling function %s(%s) -> %s"
        rendered_inputs = []
        if args := details.get("args"):
            rendered_inputs.extend(str(a) for a in args)
        if kwargs := details.get("kwargs"):
            rendered_inputs.extend(f"{k}={v}" for k, v in kwargs.items())
        target_input = ", ".join(rendered_inputs) if rendered_inputs else "..."
        log_args = [
            details["tries"],
            elapsed,
            target_name,
            target_input,
        ]

    # Prefer the exception currently being handled; otherwise report the
    # value recorded in details (None when absent).
    exc_typ, exc, _ = sys.exc_info()
    if exc is not None:
        exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
        log_args.append(exc_fmt.rstrip("\n"))
    else:
        log_args.append(details.get("value"))

    logger.log(log_level, msg, *log_args)
@@ -0,0 +1,126 @@
1
+ # coding:utf-8
2
+ import logging
3
+ from collections.abc import Iterable as IterableType
4
+ from typing import Any, Iterable, Optional, Type, Union
5
+
6
+ from backoff import _sync
7
+ from backoff._common import _config_handlers, _prepare_logger
8
+ from backoff._jitter import full_jitter
9
+ from backoff._typing import (
10
+ _Handler,
11
+ _Jitterer,
12
+ _MaybeCallable,
13
+ _MaybeLogger,
14
+ _MaybeSequence,
15
+ _Predicate,
16
+ _WaitGenerator,
17
+ )
18
+
19
+ from unstructured_ingest.processes.connectors.notion.ingest_backoff._common import (
20
+ _log_backoff,
21
+ _log_giveup,
22
+ _log_start,
23
+ )
24
+
25
+
26
class RetryHandler:
    """Callable that runs a target under ``backoff``'s retry-on-exception logic.

    The constructor stores the retry configuration and prepares the event
    handlers; calling the instance with ``(target, *args, **kwargs)`` fires
    the start handlers and then invokes ``target`` through
    ``backoff._sync.retry_exception``.
    """

    def __init__(
        self,
        wait_gen: _WaitGenerator,
        exception: _MaybeSequence[Type[Exception]],
        *,
        max_tries: Optional[_MaybeCallable[int]] = None,
        max_time: Optional[_MaybeCallable[float]] = None,
        jitter: Union[_Jitterer, None] = full_jitter,
        giveup: _Predicate[Exception] = lambda e: False,
        on_start: Union[_Handler, Iterable[_Handler], None] = None,
        on_success: Union[_Handler, Iterable[_Handler], None] = None,
        on_backoff: Union[_Handler, Iterable[_Handler], None] = None,
        on_giveup: Union[_Handler, Iterable[_Handler], None] = None,
        raise_on_giveup: bool = True,
        logger: _MaybeLogger = "backoff",
        start_log_level: int = logging.INFO,
        backoff_log_level: int = logging.INFO,
        giveup_log_level: int = logging.ERROR,
        **wait_gen_kwargs: Any,
    ):
        """Store the retry settings and build the handler lists.

        ``_log_start``, ``_log_backoff`` and ``_log_giveup`` are handed to
        ``_config_handlers`` as the default start / backoff / give-up
        handlers, each with the prepared logger and its own log level.
        ``on_success`` gets no default handler. Extra keyword arguments are
        kept as ``wait_gen_kwargs`` and forwarded to ``retry_exception``.
        """
        log = _prepare_logger(logger)

        success_handlers = _config_handlers(on_success)
        start_handlers = _config_handlers(
            on_start,
            default_handler=_log_start,
            logger=log,
            log_level=start_log_level,
        )
        backoff_handlers = _config_handlers(
            on_backoff,
            default_handler=_log_backoff,
            logger=log,
            log_level=backoff_log_level,
        )
        giveup_handlers = _config_handlers(
            on_giveup,
            default_handler=_log_giveup,
            logger=log,
            log_level=giveup_log_level,
        )

        # Render either a collection of exception classes or a single class.
        if isinstance(exception, IterableType):
            exception_names = ", ".join([e.__name__ for e in exception])
        else:
            exception_names = exception.__name__

        log.debug(
            "Initiating retry handler with "
            "max_tries={}, "
            "max_time={}, "
            "exception={}, "
            "start_log_level={}, "
            "backoff_log_level={}, "
            "giveup_log_level={}".format(
                max_tries,
                max_time,
                exception_names,
                logging.getLevelName(start_log_level),
                logging.getLevelName(backoff_log_level),
                logging.getLevelName(giveup_log_level),
            ),
        )

        # Retry policy, kept as given for use in ``__call__``.
        self.wait_gen = wait_gen
        self.wait_gen_kwargs = wait_gen_kwargs
        self.exception = exception
        self.max_tries = max_tries
        self.max_time = max_time
        self.jitter = jitter
        self.giveup = giveup
        self.raise_on_giveup = raise_on_giveup

        # Prepared event handlers.
        self.on_start = start_handlers
        self.on_success = success_handlers
        self.on_backoff = backoff_handlers
        self.on_giveup = giveup_handlers

    def __call__(self, target, *args, **kwargs):
        """Fire the start handlers, then call ``target`` with retries.

        Args:
            target: Callable to invoke.
            *args: Positional arguments forwarded to ``target``.
            **kwargs: Keyword arguments forwarded to ``target``.

        Returns:
            Whatever the callable produced by ``retry_exception`` returns.
        """
        # Start handlers run once per call, before any attempt, so there is
        # no try count or elapsed time to report yet.
        _sync._call_handlers(
            self.on_start,
            target=target,
            args=args,
            kwargs=kwargs,
            tries=None,
            elapsed=None,
            max_tries=self.max_tries,
            max_time=self.max_time,
            exception=self.exception,
        )
        retry_options = {
            "max_tries": self.max_tries,
            "max_time": self.max_time,
            "jitter": self.jitter,
            "giveup": self.giveup,
            "on_success": self.on_success,
            "on_backoff": self.on_backoff,
            "on_giveup": self.on_giveup,
            "raise_on_giveup": self.raise_on_giveup,
            "wait_gen_kwargs": self.wait_gen_kwargs,
        }
        retrying_target = _sync.retry_exception(
            target,
            self.wait_gen,
            self.exception,
            **retry_options,
        )
        return retrying_target(*args, **kwargs)