unstructured-ingest 0.3.13__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,448 @@
1
+ import enum
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional, Tuple
5
+ from urllib.parse import urlparse
6
+ from uuid import UUID
7
+
8
+ from htmlBuilder.attributes import Style
9
+ from htmlBuilder.tags import (
10
+ Body,
11
+ Div,
12
+ Head,
13
+ Html,
14
+ HtmlTag,
15
+ Ol,
16
+ Table,
17
+ Td,
18
+ Th,
19
+ Title,
20
+ Tr,
21
+ Ul,
22
+ )
23
+ from notion_client.errors import APIResponseError
24
+
25
+ import unstructured_ingest.v2.processes.connectors.notion.types.blocks as notion_blocks
26
+ from unstructured_ingest.v2.processes.connectors.notion.client import Client
27
+ from unstructured_ingest.v2.processes.connectors.notion.types.block import Block
28
+ from unstructured_ingest.v2.processes.connectors.notion.types.database import Database
29
+
30
+
31
@dataclass
class HtmlExtractionResponse:
    """Result of rendering a Notion page or database to HTML.

    Bundles the rendered markup with the ids of the child pages and child
    databases that were discovered while rendering.
    """

    # Rendered HTML tree; None when nothing was rendered.
    html: Optional[HtmlTag] = None
    # Ids of child pages found during extraction.
    child_pages: List[str] = field(default_factory=list)
    # Ids of child databases found during extraction.
    child_databases: List[str] = field(default_factory=list)
36
+
37
+
38
def process_block(
    current_block: dict,
    parent_page_id: str,
    client: Client,
    child_pages: list,
    child_databases: list,
) -> Tuple[dict, list, list]:
    """Recursively load the children of ``current_block`` into a nested dict tree.

    Args:
        current_block: Node dict with the keys ``"block"``, ``"level"``,
            ``"children"`` and ``"parent_id"``. Its ``"children"`` list is
            filled in place.
        parent_page_id: Id of the page being extracted. It is compared through
            ``str()``, so a ``UUID`` is accepted as well.
        client: Notion client used to list the children of a block.
        child_pages: Accumulator for the ids of child pages encountered.
        child_databases: Accumulator for the ids of child databases encountered.

    Returns:
        A ``(node, child_pages, child_databases)`` tuple. ``node`` is an empty
        dict when the block is a child page (other than the page being
        extracted) or a child database: those are only recorded by id and are
        not descended into.
    """
    block = current_block["block"]
    if isinstance(block.block, notion_blocks.ChildPage) and block.id != str(parent_page_id):
        child_pages.append(block.id)
        return {}, child_pages, child_databases
    if isinstance(block.block, notion_blocks.ChildDatabase):
        child_databases.append(block.id)
        return {}, child_pages, child_databases

    # recursively go through all blocks in a page, store each block in a dictionary
    if block.has_children:
        children = []
        # iterate_list yields the children in chunks; flatten them into one list.
        for children_block in client.blocks.children.iterate_list(block_id=block.id):
            children.extend(children_block)
        for child in children:
            child_block = {
                "block": child,
                "level": current_block["level"] + 1,
                "children": [],
                "parent_id": block.id,
            }
            child_element, child_pages, child_databases = process_block(
                child_block, parent_page_id, client, child_pages, child_databases
            )
            # Skipped children come back as {} and are kept as placeholders so
            # that sibling positions stay stable.
            current_block["children"].append(child_element)
    return current_block, child_pages, child_databases
74
+
75
+
76
def flush_list(type: str, item_list: list, html: list) -> Tuple[list, list]:
    """Wrap the pending list items in one list tag and append it to ``html``.

    Args:
        type: ``"bulleted_list"`` produces a ``Ul``; any other value an ``Ol``.
        item_list: Non-empty list of ``(parent_id, level, item_html)`` tuples.
        html: Output list that receives the new list tag (mutated in place).

    Returns:
        ``([], html)``: a fresh empty item list and the updated ``html``.
    """
    # Indentation is derived from the nesting level of the last pending item.
    depth = item_list[-1][1]
    indent = Style(f"margin-left: {10 * (depth - 1)}px")
    rendered_items = [entry[2] for entry in item_list]
    list_tag = Ul if type == "bulleted_list" else Ol
    html.append(list_tag([indent], rendered_items))
    return [], html
84
+
85
+
86
def build_html(
    current_block: dict, bulleted_list: list, numbered_list: list
) -> Tuple[list, list, list]:
    """Render a block tree node (and, recursively, its children) to HTML.

    Consecutive bulleted/numbered list items are buffered in ``bulleted_list``
    and ``numbered_list`` as ``(parent_id, level, item_html)`` tuples and are
    flushed into a single ``Ul``/``Ol`` when the run ends: on a parent change,
    on the last sibling, or when a non-list block is reached.

    Args:
        current_block: Node dict holding ``"block"``, ``"children"``,
            ``"parent_id"``, ``"level"``, ``"peers_rank"`` and ``"peers_count"``.
        bulleted_list: Pending bulleted list items carried across siblings.
        numbered_list: Pending numbered list items carried across siblings.

    Returns:
        ``(html, bulleted_list, numbered_list)`` where ``html`` is the list of
        rendered elements for this node and the two lists are the still-pending
        list items.
    """
    html = []
    block = current_block["block"]
    # extract current block's html
    if isinstance(block.block, notion_blocks.BulletedListItem):
        if bulleted_list and current_block["parent_id"] != bulleted_list[-1][0]:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
        bulleted_list.append((current_block["parent_id"], current_block["level"], block.get_html()))
        # The buffer is non-empty here, so only the "last sibling" check remains.
        if current_block["peers_rank"] == current_block["peers_count"] - 1:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
    elif isinstance(block.block, notion_blocks.NumberedListItem):
        if numbered_list and current_block["parent_id"] != numbered_list[-1][0]:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
        numbered_list.append((current_block["parent_id"], current_block["level"], block.get_html()))
        if current_block["peers_rank"] == current_block["peers_count"] - 1:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
    else:
        # A non-list block ends any list run that is still pending.
        if bulleted_list:
            bulleted_list, html = flush_list("bulleted_list", bulleted_list, html)
        if numbered_list:
            numbered_list, html = flush_list("numbered_list", numbered_list, html)
        if isinstance(block.block, notion_blocks.TableRow) and current_block["peers_rank"] == 0:
            # NOTE(review): this sets is_header on the Block wrapper, not on the
            # wrapped TableRow (block.block) - confirm that TableRow.get_html()
            # actually sees the flag.
            block.is_header = True
        # Render once and reuse the result instead of calling get_html() twice.
        rendered = block.get_html()
        html.append(rendered if rendered else [])
    # process current block's children
    if current_block["children"]:
        children_html = []
        sibling_count = len(current_block["children"])
        for index, child in enumerate(current_block["children"]):
            # Empty placeholders (skipped child pages/databases) are ignored.
            if child:
                child["peers_rank"] = index
                child["peers_count"] = sibling_count
                child_html, bulleted_list, numbered_list = build_html(
                    child, bulleted_list, numbered_list
                )
                if child_html:
                    children_html.append(child_html)
        if isinstance(block.block, notion_blocks.Column):
            # Columns share the available width equally and float side by side.
            html.append(
                Div(
                    [Style(f"width:{100 / current_block['peers_count']}%; float: left")],
                    children_html,
                )
            )
        elif isinstance(block.block, notion_blocks.Table):
            html.append(Table([], children_html))
        else:
            html.append(Div([], children_html))

    return html, bulleted_list, numbered_list
146
+
147
+
148
def extract_page_html(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion page to an HTML document.

    Args:
        client: Notion client used to retrieve the page block and its children.
        page_id: Id of the page; must be parseable by ``uuid.UUID``.
        logger: Logger that receives debug output.

    Returns:
        An ``HtmlExtractionResponse`` with the full ``Html`` document and the
        ids of child pages and child databases found in the page.
    """
    parent_page_id = UUID(page_id)
    root: Block = client.blocks.retrieve(block_id=page_id)  # type: ignore
    # Only a child-page block carries a title to put into <head>.
    head = (
        Head([], Title([], root.block.title))
        if isinstance(root.block, notion_blocks.ChildPage)
        else None
    )
    root_node = {
        "block": root,
        "level": 0,
        "children": [],
        "parent_id": None,
        "peers_rank": 0,
        "peers_count": 1,
    }
    logger.debug(f"processing page id: {page_id}")
    root_node, child_pages, child_databases = process_block(
        root_node, parent_page_id, client, [], []
    )
    body_content, _, _ = build_html(root_node, [], [])
    body = Body([], body_content)
    document_parts = [head, body] if head else [body]
    return HtmlExtractionResponse(
        Html([], document_parts),
        child_pages=child_pages,
        child_databases=child_databases,
    )
181
+
182
+
183
def extract_database_html(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> HtmlExtractionResponse:
    """Render a Notion database as an HTML table.

    The header row lists the database's property names in sorted order; each
    queried page becomes one row with one cell per property.

    Args:
        client: Notion client used to retrieve and query the database.
        database_id: Id of the database to render.
        logger: Logger that receives debug output.

    Returns:
        An ``HtmlExtractionResponse`` holding the table and the ids of rows
        whose URL resolves to a page or to a database.
    """
    logger.debug(f"processing database id: {database_id}")
    database: Database = client.databases.retrieve(database_id=database_id)  # type: ignore
    columns = sorted(database.properties.keys())
    child_pages: List[str] = []
    child_databases: List[str] = []
    # Create header row
    rows = [Tr([], [Th([], column) for column in columns])]

    # iterate_query yields the result pages in chunks; flatten them first.
    all_pages = []
    for page_chunk in client.databases.iterate_query(database_id=database_id):  # type: ignore
        all_pages.extend(page_chunk)

    logger.debug(f"creating {len(all_pages)} rows")
    for page in all_pages:
        if is_database_url(client=client, url=page.url):
            child_databases.append(page.id)
        if is_page_url(client=client, url=page.url):
            child_pages.append(page.id)
        row_properties = page.properties
        rendered_cells = [row_properties.get(column).get_html() for column in columns]  # type: ignore
        # A property that renders to nothing still gets an (empty) cell.
        cells = [Td([], rendered if rendered else Div([], [])) for rendered in rendered_cells]
        rows.append(Tr([], cells))

    return HtmlExtractionResponse(
        html=Table([], rows),
        child_pages=child_pages,
        child_databases=child_databases,
    )
224
+
225
+
226
@dataclass
class ChildExtractionResponse:
    """Ids of the child pages and child databases found by a recursive walk."""

    # Ids of the discovered child pages.
    child_pages: List[str] = field(default_factory=list)
    # Ids of the discovered child databases.
    child_databases: List[str] = field(default_factory=list)
230
+
231
+
232
class QueueEntryType(enum.Enum):
    """Kind of Notion object a queue entry refers to."""

    DATABASE = "database"
    PAGE = "page"
235
+
236
+
237
@dataclass
class QueueEntry:
    """A page or database that still has to be visited by the recursive walk."""

    # Whether ``id`` refers to a page or to a database.
    type: QueueEntryType
    # Id of the page or database.
    id: UUID
241
+
242
+
243
def get_recursive_content_from_page(
    client: Client,
    page_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect the child pages and databases reachable from the page ``page_id``.

    ``page_id`` must be parseable by ``uuid.UUID``.
    """
    start = QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))
    return get_recursive_content(client=client, init_entry=start, logger=logger)
253
+
254
+
255
def get_recursive_content_from_database(
    client: Client,
    database_id: str,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Collect the child pages and databases reachable from the database ``database_id``.

    ``database_id`` must be parseable by ``uuid.UUID``.
    """
    start = QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id))
    return get_recursive_content(client=client, init_entry=start, logger=logger)
265
+
266
+
267
def get_recursive_content(
    client: Client,
    init_entry: QueueEntry,
    logger: logging.Logger,
) -> ChildExtractionResponse:
    """Walk everything reachable from ``init_entry`` and collect child ids.

    Pages are expanded through their child-page, child-database and
    link-to-page blocks; databases are expanded through the URLs of their
    rows. Entries are taken from the end of the work list, so the walk is
    depth-first.

    Args:
        client: Notion client used to list block children and query databases.
        init_entry: Page or database to start from. It is not itself included
            in the result unless it is rediscovered through a link.
        logger: Logger for debug and error output.

    Returns:
        A ``ChildExtractionResponse`` with the ids of the pages and databases
        found. An entry whose lookup raises ``APIResponseError`` is logged and
        removed from the result.
    """
    parents: List[QueueEntry] = [init_entry]
    child_pages: List[str] = []
    child_dbs: List[str] = []
    # A set: membership is tested for every discovered child.
    processed = set()
    while parents:
        parent: QueueEntry = parents.pop()
        processed.add(str(parent.id))
        if parent.type == QueueEntryType.PAGE:
            logger.debug(f"getting child data from page: {parent.id}")
            page_children = []
            try:
                for children_block in client.blocks.children.iterate_list(  # type: ignore
                    block_id=str(parent.id),
                ):
                    page_children.extend(children_block)
            except APIResponseError as api_error:
                logger.error(f"failed to get page with id {parent.id}: {api_error}")
                # An unreadable page must not be reported as a child.
                if str(parent.id) in child_pages:
                    child_pages.remove(str(parent.id))
                continue
            if not page_children:
                continue

            # Extract child pages
            child_pages_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildPage)
            ]
            if child_pages_from_page:
                logger.debug(
                    "found child pages from parent page {}: {}".format(
                        parent.id,
                        ", ".join([p.block.title for p in child_pages_from_page]),
                    ),
                )
                # dict.fromkeys de-duplicates while keeping discovery order, so
                # the traversal and the result are deterministic.
                new_pages = list(
                    dict.fromkeys(p.id for p in child_pages_from_page if p.id not in processed)
                )
                child_pages.extend(new_pages)
                parents.extend(
                    [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages],
                )

            # Extract child databases
            child_dbs_from_page = [
                c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase)
            ]
            if child_dbs_from_page:
                logger.debug(
                    "found child database from parent page {}: {}".format(
                        parent.id,
                        ", ".join([db.block.title for db in child_dbs_from_page]),
                    ),
                )
                new_dbs = list(
                    dict.fromkeys(db.id for db in child_dbs_from_page if db.id not in processed)
                )
                child_dbs.extend(new_dbs)
                parents.extend(
                    [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs],
                )

            # Follow link-to-page blocks, which may point at a page or a database.
            linked_to_others: List[notion_blocks.LinkToPage] = [
                c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage)
            ]
            for link in linked_to_others:
                if (page_id := link.page_id) and (
                    page_id not in processed and page_id not in child_pages
                ):
                    child_pages.append(page_id)
                    parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id)))
                if (database_id := link.database_id) and (
                    database_id not in processed and database_id not in child_dbs
                ):
                    child_dbs.append(database_id)
                    parents.append(
                        QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)),
                    )

        elif parent.type == QueueEntryType.DATABASE:
            logger.debug(f"getting child data from database: {parent.id}")
            database_pages = []
            try:
                for page_entries in client.databases.iterate_query(  # type: ignore
                    database_id=str(parent.id),
                ):
                    database_pages.extend(page_entries)
            except APIResponseError as api_error:
                logger.error(f"failed to get database with id {parent.id}: {api_error}")
                # An unreadable database must not be reported as a child.
                if str(parent.id) in child_dbs:
                    child_dbs.remove(str(parent.id))
                continue
            if not database_pages:
                continue

            child_pages_from_db = [
                p for p in database_pages if is_page_url(client=client, url=p.url)
            ]
            if child_pages_from_db:
                logger.debug(
                    "found child pages from parent database {}: {}".format(
                        parent.id,
                        ", ".join([p.url for p in child_pages_from_db]),
                    ),
                )
                new_pages = [p.id for p in child_pages_from_db if p.id not in processed]
                child_pages.extend(new_pages)
                parents.extend(
                    [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages],
                )

            child_dbs_from_db = [
                p for p in database_pages if is_database_url(client=client, url=p.url)
            ]
            if child_dbs_from_db:
                logger.debug(
                    "found child database from parent database {}: {}".format(
                        parent.id,
                        ", ".join([db.url for db in child_dbs_from_db]),
                    ),
                )
                new_dbs = [db.id for db in child_dbs_from_db if db.id not in processed]
                child_dbs.extend(new_dbs)
                parents.extend(
                    [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs],
                )

    return ChildExtractionResponse(
        child_pages=child_pages,
        child_databases=child_dbs,
    )
410
+
411
+
412
def is_valid_uuid(uuid_str: str) -> bool:
    """Report whether ``uuid_str`` can be parsed by ``uuid.UUID``.

    Any exception raised while parsing counts as "not a UUID".
    """
    try:
        UUID(uuid_str)
    except Exception:
        return False
    return True
418
+
419
+
420
def get_uuid_from_url(path: str) -> Optional[str]:
    """Return the text after the last ``-`` in ``path`` if it is a valid UUID.

    When ``path`` contains no ``-`` the whole string is checked. Returns
    ``None`` when the candidate is not a UUID.
    """
    candidate = path.rpartition("-")[2]
    return candidate if is_valid_uuid(candidate) else None
425
+
426
+
427
def is_page_url(client: Client, url: str):
    """Check whether ``url`` points at a Notion page the client can retrieve.

    Returns ``False`` when the host is not ``www.notion.so`` or when the last
    path segment does not end in a UUID. Otherwise the page's status is
    requested through ``client`` and compared against 200.
    """
    parsed_url = urlparse(url)
    last_segment = parsed_url.path.rpartition("/")[2]
    if parsed_url.netloc != "www.notion.so":
        return False
    page_uuid = get_uuid_from_url(path=last_segment)
    if not page_uuid:
        return False
    return client.pages.retrieve_status(page_id=page_uuid) == 200
437
+
438
+
439
def is_database_url(client: Client, url: str):
    """Check whether ``url`` points at a Notion database the client can retrieve.

    Returns ``False`` when the host is not ``www.notion.so`` or when the last
    path segment does not end in a UUID. Otherwise the database's status is
    requested through ``client`` and compared against 200.
    """
    parsed_url = urlparse(url)
    last_segment = parsed_url.path.rpartition("/")[2]
    if parsed_url.netloc != "www.notion.so":
        return False
    database_uuid = get_uuid_from_url(path=last_segment)
    if not database_uuid:
        return False
    return client.databases.retrieve_status(database_id=database_uuid) == 200
@@ -0,0 +1,32 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.tags import HtmlTag
5
+
6
+
7
class FromJSONMixin(ABC):
    """Interface for types that can be constructed from a dictionary."""

    @classmethod
    @abstractmethod
    def from_dict(cls, data: dict):
        """Build an instance of ``cls`` from ``data``; subclasses must implement this."""
        pass
12
+
13
+
14
class GetHTMLMixin(ABC):
    """Interface for types that can render themselves as an HTML tag."""

    @abstractmethod
    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML for this object, or ``None`` when there is nothing to render."""
        pass
18
+
19
+
20
class BlockBase(FromJSONMixin, GetHTMLMixin):
    """Base class for block types: buildable from a dict and renderable to HTML."""

    @staticmethod
    @abstractmethod
    def can_have_children() -> bool:
        """Report whether this block type can have child blocks."""
        pass
25
+
26
+
27
class DBPropertyBase(FromJSONMixin):
    """Base class for database property types; adds nothing to ``FromJSONMixin``."""

    pass
29
+
30
+
31
class DBCellBase(FromJSONMixin, GetHTMLMixin):
    """Base class for database cell types: buildable from a dict and renderable to HTML."""

    pass
@@ -0,0 +1,96 @@
1
+ # https://developers.notion.com/reference/page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.v2.processes.connectors.notion.interfaces import (
8
+ BlockBase,
9
+ FromJSONMixin,
10
+ GetHTMLMixin,
11
+ )
12
+ from unstructured_ingest.v2.processes.connectors.notion.types import blocks
13
+ from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent
14
+ from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser
15
+
16
# Maps the value of a block's "type" field to the class that parses the
# type-specific data stored under that same key (see Block.from_dict).
# All three heading levels share blocks.Heading.
block_type_mapping = {
    "bookmark": blocks.Bookmark,
    "breadcrumb": blocks.Breadcrumb,
    "bulleted_list_item": blocks.BulletedListItem,
    "callout": blocks.Callout,
    "child_database": blocks.ChildDatabase,
    "child_page": blocks.ChildPage,
    "code": blocks.Code,
    "column": blocks.Column,
    "column_list": blocks.ColumnList,
    "divider": blocks.Divider,
    "heading_1": blocks.Heading,
    "heading_2": blocks.Heading,
    "heading_3": blocks.Heading,
    "embed": blocks.Embed,
    "equation": blocks.Equation,
    "file": blocks.File,
    "image": blocks.Image,
    "link_preview": blocks.LinkPreview,
    "link_to_page": blocks.LinkToPage,
    "numbered_list_item": blocks.NumberedListItem,
    "paragraph": blocks.Paragraph,
    "pdf": blocks.PDF,
    "quote": blocks.Quote,
    "synced_block": blocks.SyncBlock,
    "table": blocks.Table,
    "table_of_contents": blocks.TableOfContents,
    "table_row": blocks.TableRow,
    "template": blocks.Template,
    "to_do": blocks.ToDo,
    "toggle": blocks.Toggle,
    "unsupported": blocks.Unsupported,
    "video": blocks.Video,
}
50
+
51
+
52
@dataclass
class Block(FromJSONMixin, GetHTMLMixin):
    """A Notion block: common metadata plus the parsed type-specific content."""

    id: str
    type: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    has_children: bool
    parent: Parent
    # Type-specific content, parsed by the class registered for ``type``.
    block: BlockBase
    object: str = "block"
    request_id: Optional[str] = None

    def __repr__(self):
        cls_name = type(self).__name__
        return f"{cls_name}(id={self.id}, type={self.type})"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Block`` from a block dictionary.

        The type-specific payload and the ``created_by``, ``last_edited_by``
        and ``parent`` entries are popped from ``data`` (the dict is mutated);
        the remaining keys are passed to the constructor as-is.

        Raises:
            KeyError: A required key is missing, or a ``KeyError`` was raised
                while building the block (e.g. the type has no registered
                class).
            TypeError: A ``TypeError`` was raised while building the block
                (e.g. ``data`` holds keys the constructor does not accept).
        """
        block_type = data["type"]
        payload = data.pop(block_type)
        creator = data.pop("created_by")
        editor = data.pop("last_edited_by")
        parent_data = data.pop("parent")
        try:
            return cls(
                created_by=PartialUser.from_dict(creator),
                last_edited_by=PartialUser.from_dict(editor),
                parent=Parent.from_dict(parent_data),
                block=block_type_mapping[block_type].from_dict(payload),  # type: ignore
                **data,
            )
        except KeyError as ke:
            message = f"failed to map to associated block type -> {block_type}: {payload}"
            raise KeyError(message) from ke
        except TypeError as te:
            message = f"failed to map to associated block type -> {block_type}: {payload}"
            raise TypeError(message) from te

    def get_html(self) -> Optional[HtmlTag]:
        """Return the HTML of the wrapped block, or ``None`` when no block is set."""
        return self.block.get_html() if self.block else None
@@ -0,0 +1,63 @@
1
+ from .bookmark import Bookmark
2
+ from .breadcrumb import Breadcrumb
3
+ from .bulleted_list_item import BulletedListItem
4
+ from .callout import Callout
5
+ from .child_database import ChildDatabase
6
+ from .child_page import ChildPage
7
+ from .code import Code
8
+ from .column_list import Column, ColumnList
9
+ from .divider import Divider
10
+ from .embed import Embed
11
+ from .equation import Equation
12
+ from .file import File
13
+ from .heading import Heading
14
+ from .image import Image
15
+ from .link_preview import LinkPreview
16
+ from .link_to_page import LinkToPage
17
+ from .numbered_list import NumberedListItem
18
+ from .paragraph import Paragraph
19
+ from .pdf import PDF
20
+ from .quote import Quote
21
+ from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock
22
+ from .table import Table, TableRow
23
+ from .table_of_contents import TableOfContents
24
+ from .template import Template
25
+ from .todo import ToDo
26
+ from .toggle import Toggle
27
+ from .unsupported import Unsupported
28
+ from .video import Video
29
+
30
# Public names exported by this package.
__all__ = [
    "Bookmark",
    "Breadcrumb",
    "BulletedListItem",
    "Callout",
    "ChildDatabase",
    "ChildPage",
    "Code",
    "Column",
    "ColumnList",
    "Divider",
    "Embed",
    "Equation",
    "File",
    "Heading",
    "Image",
    "LinkPreview",
    "LinkToPage",
    "NumberedListItem",
    "Paragraph",
    "PDF",
    "Quote",
    "SyncBlock",
    "OriginalSyncedBlock",
    "DuplicateSyncedBlock",
    "Table",
    "TableRow",
    "TableOfContents",
    "Template",
    "ToDo",
    "Toggle",
    "Unsupported",
    "Video",
]
@@ -0,0 +1,40 @@
1
+ # https://developers.notion.com/reference/block#bookmark
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Bookmark(BlockBase):
    """Bookmark block: a URL with an optional rich-text caption."""

    url: str
    caption: List[RichText] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a bookmark from ``data``; ``"caption"`` is popped from ``data`` if present."""
        raw_caption = data.pop("caption", [])
        link = data["url"]
        return cls(url=link, caption=[RichText.from_dict(item) for item in raw_caption])

    def get_html(self) -> Optional[HtmlTag]:
        """Render the link and the caption, separated by a line break.

        Returns ``None`` when the bookmark has neither a URL nor a caption.
        """
        pieces = []
        if self.url:
            pieces.append(A([Href(self.url)], self.url))
        if self.caption:
            pieces.append(Div([], [rich_text.get_html() for rich_text in self.caption]))
        if not pieces:
            return None
        # Interleave a line break between consecutive pieces.
        separated = []
        for piece in pieces:
            if separated:
                separated.append(Br())
            separated.append(piece)

        return Div([], separated)

    @staticmethod
    def can_have_children() -> bool:
        return False
@@ -0,0 +1,21 @@
1
+ # https://developers.notion.com/reference/block#breadcrumb
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class Breadcrumb(BlockBase):
    """Breadcrumb block: carries no data and renders to nothing."""

    @staticmethod
    def can_have_children() -> bool:
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build a breadcrumb; ``data`` is not read."""
        return cls()

    def get_html(self) -> Optional[HtmlTag]:
        """Always return ``None``."""
        return None