unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,31 @@
1
+ # https://developers.notion.com/reference/block#bulleted-list-item
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Li
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class BulletedListItem(BlockBase):
    """Bulleted list item block: a color, raw child dicts and rich-text runs."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` is popped from ``data`` (so the caller's dict is mutated)
        and each entry is parsed with ``RichText.from_dict``. ``color`` is
        required; ``children`` falls back to an empty list.
        """
        raw_runs = data.pop("rich_text", [])
        color = data["color"]
        children = data.get("children", [])
        runs = [RichText.from_dict(raw) for raw in raw_runs]
        return cls(color=color, children=children, rich_text=runs)

    def get_html(self) -> Optional[HtmlTag]:
        """Render every rich-text run and wrap the results in an ``Li`` tag."""
        rendered = [run.get_html() for run in self.rich_text]
        return Li([], rendered)
@@ -0,0 +1,94 @@
1
+ # https://developers.notion.com/reference/block#callout
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional, Union
4
+
5
+ from htmlBuilder.attributes import Href, Style
6
+ from htmlBuilder.tags import A, Div, HtmlTag, P
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import (
9
+ BlockBase,
10
+ FromJSONMixin,
11
+ GetHTMLMixin,
12
+ )
13
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
14
+
15
+
16
@dataclass
class EmojiIcon(FromJSONMixin, GetHTMLMixin):
    """Icon variant carrying an emoji string; ``type`` defaults to ``"emoji"``."""

    emoji: str
    type: str = "emoji"

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the icon by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``P`` tag whose content is the emoji string."""
        no_attributes = []
        return P(no_attributes, self.emoji)
27
+
28
+
29
@dataclass
class ExternalIconContent(FromJSONMixin):
    """Payload of an external icon: just its URL."""

    url: str

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the payload by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance
36
+
37
+
38
@dataclass
class ExternalIcon(FromJSONMixin, GetHTMLMixin):
    """Icon variant that points at an external URL; ``type`` defaults to ``"external"``."""

    external: ExternalIconContent
    type: str = "external"

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        The ``external`` entry is popped from ``data`` (mutating the caller's
        dict) and parsed into ``ExternalIconContent``; the remaining keys are
        forwarded as keyword arguments.
        """
        content = ExternalIconContent.from_dict(data=data.pop("external"))
        return cls(external=content, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``A`` tag linking to (and showing) the URL, or None without content."""
        if not self.external:
            return None
        link_target = Href(self.external.url)
        return A([link_target], [self.external.url])
52
+
53
+
54
class Icon(FromJSONMixin):
    """Factory that picks the concrete icon class from the payload's ``type`` key."""

    @classmethod
    def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon]:
        """Parse ``data`` into an ``EmojiIcon`` or ``ExternalIcon``.

        Raises:
            ValueError: if ``data["type"]`` is neither ``"emoji"`` nor ``"external"``.
        """
        t = data.get("type")
        if t == "emoji":
            return EmojiIcon.from_dict(data)
        if t == "external":
            return ExternalIcon.from_dict(data)
        # Any other (or missing) type is rejected rather than silently ignored.
        raise ValueError(f"Unexpected icon type: {t} ({data})")
64
+
65
+
66
@dataclass
class Callout(BlockBase):
    """Callout block: a color, an optional icon and rich-text runs."""

    color: str
    icon: Optional[Union[EmojiIcon, ExternalIcon]] = None
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` and ``icon`` are popped from ``data`` (mutating the
        caller's dict). ``color`` is required. A missing or null ``icon`` is
        accepted and stored as ``None``, matching the field's declared
        ``Optional`` type instead of failing with KeyError/AttributeError.

        Raises:
            KeyError: if ``color`` is absent.
            ValueError: if an icon is present but has an unexpected type.
        """
        rich_text = data.pop("rich_text", [])
        color = data["color"]
        icon_data = data.pop("icon", None)
        # Only parse an icon that is actually present; None means "no icon".
        icon = Icon.from_dict(icon_data) if icon_data is not None else None
        return cls(
            color=color,
            icon=icon,
            rich_text=[RichText.from_dict(rt) for rt in rich_text],
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` holding the icon (if it renders) followed by the rich text.

        When ``color`` is set it is emitted as a ``color:<value>`` style attribute.
        """
        elements = []
        # Render the icon once and reuse the result rather than building it twice.
        icon_html = self.icon.get_html() if self.icon else None
        if icon_html:
            elements.append(icon_html)
        if self.rich_text:
            elements.extend([rt.get_html() for rt in self.rich_text])
        attributes = []
        if self.color:
            attributes.append(Style(f"color:{self.color}"))
        return Div(attributes, elements)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#child-database
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, P
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class ChildDatabase(BlockBase):
    """Child-database block identified only by its title."""

    title: str

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the block by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``P`` tag whose content is the title."""
        no_attributes = []
        return P(no_attributes, self.title)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#child-page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, P
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase, GetHTMLMixin
8
+
9
+
10
@dataclass
class ChildPage(BlockBase, GetHTMLMixin):
    """Child-page block identified only by its title."""

    title: str

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the block by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``P`` tag whose content is the title."""
        no_attributes = []
        return P(no_attributes, self.title)
@@ -0,0 +1,43 @@
1
+ # https://developers.notion.com/reference/block#code
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Br, Div, HtmlTag
6
+ from htmlBuilder.tags import Code as HtmlCode
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Code(BlockBase):
    """Code block: a language name, the code's rich-text runs and an optional caption."""

    language: str
    rich_text: List[RichText] = field(default_factory=list)
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` and ``caption`` are popped from ``data`` (mutating the
        caller's dict) and parsed with ``RichText.from_dict``; ``language`` is
        required.
        """
        raw_runs = data.pop("rich_text", [])
        raw_caption = data.pop("caption", [])
        language = data["language"]
        runs = [RichText.from_dict(raw) for raw in raw_runs]
        caption_runs = [RichText.from_dict(raw) for raw in raw_caption]
        return cls(language=language, rich_text=runs, caption=caption_runs)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the code and caption separated by ``Br``, or None if empty."""
        sections = []
        if self.rich_text:
            sections.append(HtmlCode([], [run.get_html() for run in self.rich_text]))
        if self.caption:
            sections.append(Div([], [run.get_html() for run in self.caption]))
        if not sections:
            return None

        # Interleave the sections with one shared line-break tag.
        line_break = Br()
        interleaved = []
        for section in sections:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(section)
        return Div([], interleaved)
@@ -0,0 +1,35 @@
1
+ # https://developers.notion.com/reference/block#column-list-and-column
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class ColumnList(BlockBase):
    """Field-less block type that renders no HTML of its own."""

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return a fresh instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: nothing is rendered for this block itself."""
        return None
22
+
23
+
24
@dataclass
class Column(BlockBase):
    """Field-less block type that renders no HTML of its own."""

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return a fresh instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: nothing is rendered for this block itself."""
        return None
@@ -0,0 +1,22 @@
1
+ # https://developers.notion.com/reference/block#divider
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Hr, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+
10
+
11
@dataclass
class Divider(BlockBase):
    """Field-less block type rendered as a styled horizontal rule."""

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Return a fresh instance; ``data`` is not read."""
        instance = cls()
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``Hr`` tag carrying a fixed top-border style."""
        rule_style = Style("border-top: 3px solid #bbb")
        return Hr([rule_style])
@@ -0,0 +1,36 @@
1
+ # https://developers.notion.com/reference/block#embed
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Embed(BlockBase):
    """Embed block: a URL plus an optional rich-text caption."""

    url: str
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``caption`` is popped from ``data`` (mutating the caller's dict) and
        parsed with ``RichText.from_dict``; the remaining keys are forwarded as
        keyword arguments.
        """
        caption_runs = [RichText.from_dict(raw) for raw in data.pop("caption", [])]
        return cls(caption=caption_runs, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` with the link and caption separated by ``Br``, or None if empty."""
        sections = []
        if self.url:
            sections.append(A([Href(self.url)], self.url))
        if self.caption:
            sections.append(Div([], [run.get_html() for run in self.caption]))
        if not sections:
            return None

        # Interleave the sections with one shared line-break tag.
        line_break = Br()
        interleaved = []
        for section in sections:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(section)
        return Div([], interleaved)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#equation
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class Equation(BlockBase):
    """Equation block holding the expression string."""

    expression: str

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the block by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` tag whose content is the raw expression string."""
        no_attributes = []
        return Div(no_attributes, self.expression)
@@ -0,0 +1,49 @@
1
+ # https://developers.notion.com/reference/block#file
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.file import External
10
+ from unstructured_ingest.connector.notion.types.file import File as FileContent
11
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
12
+
13
+
14
@dataclass
class File(BlockBase):
    """File block: a type tag, an external or hosted file reference and a caption."""

    type: str
    external: Optional[External] = None
    file: Optional[FileContent] = None
    caption: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``caption`` is popped from ``data`` (mutating the caller's dict).
        Depending on ``data["type"]`` either the ``external`` or the ``file``
        entry is parsed; any other type leaves both unset.
        """
        caption_runs = [RichText.from_dict(raw) for raw in data.pop("caption", [])]
        kind = data["type"]
        block = cls(type=kind, caption=caption_runs)
        if kind == "external":
            block.external = External.from_dict(data["external"])
        elif kind == "file":
            block.file = FileContent.from_dict(data["file"])
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` of links and caption separated by ``Br``, or None if empty."""
        sections = []
        if self.file:
            sections.append(A([Href(self.file.url)], self.file.url))
        if self.external:
            sections.append(A([Href(self.external.url)], self.external.url))
        if self.caption:
            sections.append(Div([], [run.get_html() for run in self.caption]))
        if not sections:
            return None

        # Interleave the sections with one shared line-break tag.
        line_break = Br()
        interleaved = []
        for section in sections:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(section)
        return Div([], interleaved)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#headings
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Heading(BlockBase):
    """Heading block: a color, a toggleable flag and rich-text runs."""

    color: str
    is_toggleable: bool
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` is popped from ``data`` (mutating the caller's dict); the
        remaining keys are forwarded as keyword arguments and the parsed runs
        are attached afterwards.
        """
        raw_runs = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_runs]
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` of the rendered runs, or None when there is no text.

        A color other than ``"default"`` is emitted as a style attribute.
        """
        if not self.rich_text:
            return None

        rendered = [run.get_html() for run in self.rich_text]
        has_custom_color = self.color and self.color != "default"
        attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(attributes, rendered)
@@ -0,0 +1,21 @@
1
+ # https://developers.notion.com/reference/block#image
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.attributes import Src
5
+ from htmlBuilder.tags import HtmlTag, Img
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.file import FileObject
9
+
10
+
11
class Image(BlockBase, FileObject):
    """Image block rendered from whichever file reference (external or hosted) is set."""

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``Img`` tag for the external URL, else the hosted file URL, else None."""
        # The external reference wins; the hosted file is only consulted as a fallback.
        source = self.external or self.file
        if not source:
            return None
        return Img([Src(source.url)], [])
@@ -0,0 +1,24 @@
1
+ # https://developers.notion.com/reference/block#link-preview
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+
10
+
11
@dataclass
class LinkPreview(BlockBase):
    """Link-preview block holding the previewed URL."""

    url: str

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the block by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``A`` tag that links to the URL and shows it as its text."""
        link_target = Href(self.url)
        return A([link_target], self.url)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#link-to-page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class LinkToPage(BlockBase):
    """Link-to-page block: a type tag plus an optional page id or database id."""

    type: str
    page_id: Optional[str] = None
    database_id: Optional[str] = None

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Construct the block by passing every key of ``data`` as a keyword argument."""
        instance = cls(**data)
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` showing the page id, else the database id, else None."""
        # The page id takes precedence; the database id is only a fallback.
        target_id = self.page_id or self.database_id
        if not target_id:
            return None
        return Div([], target_id)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#numbered-list-item
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Li
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class NumberedListItem(BlockBase):
    """Numbered list item block: a color, raw child dicts and rich-text runs."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` is popped from ``data`` (mutating the caller's dict); the
        remaining keys are forwarded as keyword arguments and the parsed runs
        are attached afterwards.
        """
        raw_runs = data.pop("rich_text", [])
        item = cls(**data)
        item.rich_text = [RichText.from_dict(raw) for raw in raw_runs]
        return item

    def get_html(self) -> Optional[HtmlTag]:
        """Render every rich-text run and wrap the results in an ``Li`` tag."""
        rendered = [run.get_html() for run in self.rich_text]
        return Li([], rendered)
@@ -0,0 +1,31 @@
1
+ # https://developers.notion.com/reference/block#paragraph
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Br, Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Paragraph(BlockBase):
    """Paragraph block: a color, raw child dicts and rich-text runs."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` is popped from ``data`` (mutating the caller's dict); the
        remaining keys are forwarded as keyword arguments and the parsed runs
        are attached afterwards.
        """
        raw_runs = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_runs]
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` of the rendered runs; an empty paragraph becomes a ``Br``."""
        if self.rich_text:
            rendered = [run.get_html() for run in self.rich_text]
            return Div([], rendered)
        return Br()
@@ -0,0 +1,49 @@
1
+ # https://developers.notion.com/reference/block#pdf
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.file import External, File
10
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
11
+
12
+
13
@dataclass
class PDF(BlockBase):
    """PDF block: a type tag, a caption and an external or hosted file reference."""

    type: str
    caption: List[RichText] = field(default_factory=list)
    external: Optional[External] = None
    file: Optional[File] = None

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type is flagged as a leaf."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``caption`` is popped from ``data`` (mutating the caller's dict).
        Depending on ``data["type"]`` either the ``external`` or the ``file``
        entry is parsed; any other type leaves both unset.
        """
        raw_caption = data.pop("caption", [])
        kind = data["type"]
        block = cls(type=kind)
        block.caption = [RichText.from_dict(raw) for raw in raw_caption]
        if kind == "external":
            block.external = External.from_dict(data["external"])
        elif kind == "file":
            block.file = File.from_dict(data["file"])
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` of links and caption separated by ``Br``, or None if empty."""
        sections = []
        if self.external:
            sections.append(A([Href(self.external.url)], self.external.url))
        if self.file:
            sections.append(A([Href(self.file.url)], self.file.url))
        if self.caption:
            sections.append(Div([], [run.get_html() for run in self.caption]))
        if not sections:
            return None

        # Interleave the sections with one shared line-break tag.
        line_break = Br()
        interleaved = []
        for section in sections:
            if interleaved:
                interleaved.append(line_break)
            interleaved.append(section)
        return Div([], interleaved)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#quote
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Quote(BlockBase):
    """Quote block: a color, raw child dicts and rich-text runs."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type is flagged as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Create an instance from ``data``.

        ``rich_text`` is popped from ``data`` (mutating the caller's dict); the
        remaining keys are forwarded as keyword arguments and the parsed runs
        are attached afterwards.
        """
        raw_runs = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = [RichText.from_dict(raw) for raw in raw_runs]
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Return a ``Div`` of the rendered runs, or None when there is no text.

        A color other than ``"default"`` is emitted as a style attribute.
        """
        if not self.rich_text:
            return None

        rendered = [run.get_html() for run in self.rich_text]
        has_custom_color = self.color and self.color != "default"
        attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(attributes, rendered)