unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,24 @@
1
+ # https://developers.notion.com/reference/block#link-preview
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+
10
+
11
@dataclass
class LinkPreview(BlockBase):
    """Notion ``link_preview`` block: a single URL rendered as an anchor."""

    url: str

    @staticmethod
    def can_have_children() -> bool:
        """Link previews never carry nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload; keys must match the dataclass fields."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return an ``<a>`` tag whose href and visible text are both the URL."""
        target = self.url
        return A([Href(target)], target)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#link-to-page
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class LinkToPage(BlockBase):
    """Notion ``link_to_page`` block referencing a page or a database by id."""

    type: str
    page_id: Optional[str] = None
    database_id: Optional[str] = None

    @staticmethod
    def can_have_children() -> bool:
        """Links to pages never carry nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload; keys must match the dataclass fields."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the referenced id inside a ``<div>``.

        The page id wins when both ids are set; ``None`` is returned when
        neither is set.
        """
        reference = self.page_id or self.database_id
        if not reference:
            return None
        return Div([], reference)
@@ -0,0 +1,29 @@
1
+ # https://developers.notion.com/reference/block#numbered-list-item
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Li
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class NumberedListItem(BlockBase):
    """Notion ``numbered_list_item`` block."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Numbered list items may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the item from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        item = cls(**data)
        item.rich_text = list(map(RichText.from_dict, raw_fragments))
        return item

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich-text fragments inside an ``<li>`` tag."""
        rendered = []
        for fragment in self.rich_text:
            rendered.append(fragment.get_html())
        return Li([], rendered)
@@ -0,0 +1,31 @@
1
+ # https://developers.notion.com/reference/block#paragraph
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Br, Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Paragraph(BlockBase):
    """Notion ``paragraph`` block."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Paragraphs may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the paragraph from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = list(map(RichText.from_dict, raw_fragments))
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render the text in a ``<div>``; an empty paragraph becomes ``<br>``."""
        fragments = self.rich_text
        if fragments:
            return Div([], [fragment.get_html() for fragment in fragments])
        return Br()
@@ -0,0 +1,49 @@
1
+ # https://developers.notion.com/reference/block#pdf
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Href
6
+ from htmlBuilder.tags import A, Br, Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.file import External, File
10
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
11
+
12
+
13
@dataclass
class PDF(BlockBase):
    """Notion ``pdf`` block: an external or hosted file plus an optional caption."""

    type: str
    caption: List[RichText] = field(default_factory=list)
    external: Optional[External] = None
    file: Optional[File] = None

    @staticmethod
    def can_have_children() -> bool:
        """PDF blocks never carry nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload.

        ``caption`` is popped from ``data`` (the caller's dict is mutated).
        Only the sub-object named by ``data["type"]`` is parsed: ``external``
        or ``file``; any other type leaves both unset.
        """
        raw_caption = data.pop("caption", [])
        source_kind = data["type"]
        pdf = cls(type=source_kind)
        pdf.caption = list(map(RichText.from_dict, raw_caption))
        if source_kind == "external":
            pdf.external = External.from_dict(data["external"])
        elif source_kind == "file":
            pdf.file = File.from_dict(data["file"])
        return pdf

    def get_html(self) -> Optional[HtmlTag]:
        """Render links and caption in a ``<div>``, separated by ``<br>``.

        Returns ``None`` when there is neither a link nor a caption.
        """
        parts = []
        if self.external:
            parts.append(A([Href(self.external.url)], self.external.url))
        if self.file:
            parts.append(A([Href(self.file.url)], self.file.url))
        if self.caption:
            parts.append(Div([], [fragment.get_html() for fragment in self.caption]))
        if not parts:
            return None

        # A single Br instance is reused between consecutive parts.
        line_break = Br()
        interleaved = []
        for position, part in enumerate(parts):
            if position:
                interleaved.append(line_break)
            interleaved.append(part)
        return Div([], interleaved)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#quote
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Quote(BlockBase):
    """Notion ``quote`` block."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Quotes may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the quote from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = list(map(RichText.from_dict, raw_fragments))
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render the text in a ``<div>``, or ``None`` when there is no text.

        A colour other than ``"default"`` becomes an inline ``color`` style.
        """
        if not self.rich_text:
            return None

        rendered = [fragment.get_html() for fragment in self.rich_text]
        has_custom_color = bool(self.color) and self.color != "default"
        attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(attributes, rendered)
@@ -0,0 +1,57 @@
1
+ # https://developers.notion.com/reference/block#synced-block
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class OriginalSyncedBlock(BlockBase):
    """The original (source) variant of a Notion ``synced_block``."""

    synced_from: Optional[str] = None
    children: List[dict] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Original synced blocks hold the synced content as children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload.

        Only ``children`` is read. A payload without that key (or with a
        null value) yields an empty child list, matching the field default,
        instead of raising ``KeyError``.
        """
        return cls(children=data.get("children") or [])

    def get_html(self) -> Optional[HtmlTag]:
        """The block itself renders nothing."""
        return None
25
+
26
+
27
@dataclass
class DuplicateSyncedBlock(BlockBase):
    """A Notion ``synced_block`` that points at another block by id."""

    type: str
    block_id: str

    @staticmethod
    def can_have_children() -> bool:
        """Duplicate synced blocks are reported as able to hold children."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload; keys must match the dataclass fields."""
        duplicate = cls(**data)
        return duplicate

    def get_html(self) -> Optional[HtmlTag]:
        """The block itself renders nothing."""
        return None
42
+
43
+
44
class SyncBlock(BlockBase):
    """Dispatcher that picks the right synced-block class for a payload."""

    @staticmethod
    def can_have_children() -> bool:
        """Synced blocks may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return an ``OriginalSyncedBlock`` or a ``DuplicateSyncedBlock``.

        Notion's block reference documents ``synced_from`` as present on both
        variants: null on the original and a reference object on a duplicate.
        Testing only for the key's presence therefore routes duplicates to
        the original class, so the value is inspected instead.
        """
        synced_from = data.get("synced_from")
        if isinstance(synced_from, dict):
            # NOTE(review): assumes the reference object's keys match the
            # DuplicateSyncedBlock fields (type, block_id) - confirm against
            # live API payloads.
            return DuplicateSyncedBlock.from_dict(synced_from)
        if "synced_from" in data:
            return OriginalSyncedBlock.from_dict(data)
        # No synced_from key: keep the previous behaviour and treat the
        # payload itself as the duplicate's field dict.
        return DuplicateSyncedBlock.from_dict(data)

    def get_html(self) -> Optional[HtmlTag]:
        """The dispatcher itself renders nothing."""
        return None
@@ -0,0 +1,63 @@
1
+ # https://developers.notion.com/reference/block#table
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Td, Th, Tr
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import (
8
+ BlockBase,
9
+ FromJSONMixin,
10
+ )
11
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
12
+
13
+
14
@dataclass
class Table(BlockBase):
    """Notion ``table`` block; the rows are separate child blocks."""

    table_width: int
    has_column_header: bool
    has_row_header: bool

    @staticmethod
    def can_have_children() -> bool:
        """Tables may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the table from its payload; keys must match the dataclass fields."""
        table = cls(**data)
        return table

    def get_html(self) -> Optional[HtmlTag]:
        """The table block itself renders nothing."""
        return None
30
+
31
+
32
@dataclass
class TableCell(FromJSONMixin):
    """One table cell holding a list of rich-text fragments."""

    rich_texts: List[RichText]

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data["rich_texts"]``.

        The key is popped (the caller's dict is mutated); a missing key
        yields an empty cell.
        """
        raw_fragments = data.pop("rich_texts", [])
        return cls(rich_texts=list(map(RichText.from_dict, raw_fragments)))

    def get_html(self, is_header: bool) -> Optional[HtmlTag]:
        """Render the cell as ``<th>`` when ``is_header`` is true, else ``<td>``."""
        cell_tag = Th if is_header else Td
        return cell_tag([], [fragment.get_html() for fragment in self.rich_texts])
45
+
46
+
47
+ # https://developers.notion.com/reference/block#table-rows
48
@dataclass
class TableRow(BlockBase):
    """Notion ``table_row`` block: an ordered list of cells."""

    is_header: bool = False
    cells: List[TableCell] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Table rows never carry nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the row from ``data["cells"]``.

        Each entry is wrapped as ``{"rich_texts": entry}`` and parsed into a
        ``TableCell``. ``is_header`` keeps its default here.
        """
        parsed_cells = []
        for raw_cell in data.get("cells", []):
            parsed_cells.append(TableCell.from_dict({"rich_texts": raw_cell}))
        return cls(cells=parsed_cells)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the row as ``<tr>``, passing ``is_header`` to every cell."""
        rendered_cells = [cell.get_html(is_header=self.is_header) for cell in self.cells]
        return Tr([], rendered_cells)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#table-of-contents
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class TableOfContents(BlockBase):
    """Notion ``table_of_contents`` block; only its colour is stored."""

    color: str

    @staticmethod
    def can_have_children() -> bool:
        """A table of contents never carries nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload; keys must match the dataclass fields."""
        toc = cls(**data)
        return toc

    def get_html(self) -> Optional[HtmlTag]:
        """The block renders nothing."""
        return None
@@ -0,0 +1,30 @@
1
+ # https://developers.notion.com/reference/block#template
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Template(BlockBase):
    """Notion ``template`` block."""

    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Templates may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the template from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = list(map(RichText.from_dict, raw_fragments))
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render the text in a ``<div>``, or ``None`` when there is no text."""
        fragments = self.rich_text
        if fragments:
            return Div([], [fragment.get_html() for fragment in fragments])
        return None
@@ -0,0 +1,42 @@
1
+ # https://developers.notion.com/reference/block#to-do
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Checked, Style, Type
6
+ from htmlBuilder.tags import Div, HtmlTag, Input
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class ToDo(BlockBase):
    """Notion ``to_do`` block: a checkbox followed by rich text."""

    color: str
    checked: bool = False
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """To-do items may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the item from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        item = cls(**data)
        item.rich_text = list(map(RichText.from_dict, raw_fragments))
        return item

    def get_html(self) -> Optional[HtmlTag]:
        """Render a checkbox input followed by the text, wrapped in a ``<div>``.

        Returns ``None`` when there is no text. A colour other than
        ``"default"`` becomes an inline ``color`` style on the wrapper.
        """
        if not self.rich_text:
            return None

        checkbox_attributes = [Type("checkbox")]
        if self.checked:
            checkbox_attributes.append(Checked(""))
        contents = [Input(checkbox_attributes)]
        for fragment in self.rich_text:
            contents.append(fragment.get_html())

        has_custom_color = bool(self.color) and self.color != "default"
        wrapper_attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(wrapper_attributes, contents)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#toggle-blocks
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Toggle(BlockBase):
    """Notion ``toggle`` block."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Toggles may carry nested blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the toggle from its payload.

        ``rich_text`` is popped from ``data`` (the caller's dict is mutated)
        and parsed separately; the remaining keys go to the constructor.
        """
        raw_fragments = data.pop("rich_text", [])
        block = cls(**data)
        block.rich_text = list(map(RichText.from_dict, raw_fragments))
        return block

    def get_html(self) -> Optional[HtmlTag]:
        """Render the text in a ``<div>``, or ``None`` when there is no text.

        A colour other than ``"default"`` becomes an inline ``color`` style.
        """
        if not self.rich_text:
            return None

        rendered = [fragment.get_html() for fragment in self.rich_text]
        has_custom_color = bool(self.color) and self.color != "default"
        attributes = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(attributes, rendered)
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.tags import HtmlTag
5
+
6
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
7
+
8
+
9
@dataclass
class Unsupported(BlockBase):
    """Placeholder block with no fields; the payload is ignored."""

    @staticmethod
    def can_have_children() -> bool:
        """Unsupported blocks never carry nested blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Return an empty instance; ``data`` is not read."""
        placeholder = cls()
        return placeholder

    def get_html(self) -> Optional[HtmlTag]:
        """Unsupported blocks render nothing."""
        return None
@@ -0,0 +1,22 @@
1
+ # https://developers.notion.com/reference/block#video
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.attributes import Src
5
+ from htmlBuilder.tags import HtmlTag, Source
6
+ from htmlBuilder.tags import Video as VideoHtml
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.file import FileObject
10
+
11
+
12
class Video(BlockBase, FileObject):
    """Notion ``video`` block backed by an external or a hosted file."""

    @staticmethod
    def can_have_children() -> bool:
        """Video blocks never carry nested blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render a ``<video>`` tag with one ``<source>`` for the file URL.

        The external source wins when both are set; ``None`` is returned
        when neither is set.
        """
        origin = self.external or self.file
        if not origin:
            return None
        return VideoHtml([], [Source([Src(origin.url)], [origin.url])])
@@ -0,0 +1,73 @@
1
+ # https://developers.notion.com/reference/database
2
+ from dataclasses import dataclass, field
3
+ from typing import Dict, List, Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag, Span
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import (
8
+ DBPropertyBase,
9
+ FromJSONMixin,
10
+ GetHTMLMixin,
11
+ )
12
+ from unstructured_ingest.connector.notion.types.database_properties import (
13
+ map_properties,
14
+ )
15
+ from unstructured_ingest.connector.notion.types.file import FileObject
16
+ from unstructured_ingest.connector.notion.types.parent import Parent
17
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
18
+ from unstructured_ingest.connector.notion.types.user import PartialUser
19
+
20
+
21
@dataclass
class Database(FromJSONMixin, GetHTMLMixin):
    """Notion database object with typed users, parent, files and properties."""

    id: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    parent: Parent
    url: str
    is_inline: bool
    public_url: str
    request_id: Optional[str] = None
    properties: Dict[str, DBPropertyBase] = field(default_factory=dict)
    title: List[RichText] = field(default_factory=list)
    description: List[RichText] = field(default_factory=list)
    icon: Optional[FileObject] = None
    cover: Optional[FileObject] = None
    object: str = "database"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Database`` from its payload.

        The nested keys are popped from ``data`` (the caller's dict is
        mutated) and parsed into their own types; whatever remains is passed
        to the constructor as-is. All popped keys except ``properties`` are
        required and raise ``KeyError`` when absent.
        """
        nested_keys = (
            "created_by",
            "last_edited_by",
            "icon",
            "cover",
            "parent",
            "title",
            "description",
        )
        raw = {key: data.pop(key) for key in nested_keys}

        properties = map_properties(data.pop("properties", {}))
        created_by = PartialUser.from_dict(raw["created_by"])
        last_edited_by = PartialUser.from_dict(raw["last_edited_by"])
        # icon and cover may be empty in the payload; they stay None then.
        icon = FileObject.from_dict(raw["icon"]) if raw["icon"] else None
        cover = FileObject.from_dict(raw["cover"]) if raw["cover"] else None
        parent = Parent.from_dict(raw["parent"])
        title = [RichText.from_dict(data=entry) for entry in raw["title"]]
        description = [RichText.from_dict(data=entry) for entry in raw["description"]]

        return cls(
            properties=properties,
            created_by=created_by,
            last_edited_by=last_edited_by,
            icon=icon,
            cover=cover,
            parent=parent,
            title=title,
            description=description,
            **data,
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render title and description as ``<span>`` tags inside a ``<div>``.

        Empty sections are skipped; ``None`` is returned when both are empty.
        """
        spans = [
            Span([], [fragment.get_html() for fragment in section])
            for section in (self.title, self.description)
            if section
        ]
        return Div([], spans) if spans else None
@@ -0,0 +1,106 @@
1
+ from typing import Dict
2
+
3
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
4
+
5
+ from .checkbox import Checkbox, CheckboxCell
6
+ from .created_by import CreatedBy, CreatedByCell
7
+ from .created_time import CreatedTime, CreatedTimeCell
8
+ from .date import Date, DateCell
9
+ from .email import Email, EmailCell
10
+ from .files import Files, FilesCell
11
+ from .formula import Formula, FormulaCell
12
+ from .last_edited_by import LastEditedBy, LastEditedByCell
13
+ from .last_edited_time import LastEditedTime, LastEditedTimeCell
14
+ from .multiselect import MultiSelect, MultiSelectCell
15
+ from .number import Number, NumberCell
16
+ from .people import People, PeopleCell
17
+ from .phone_number import PhoneNumber, PhoneNumberCell
18
+ from .relation import Relation, RelationCell
19
+ from .rich_text import RichText, RichTextCell
20
+ from .rollup import Rollup, RollupCell
21
+ from .select import Select, SelectCell
22
+ from .status import Status, StatusCell
23
+ from .title import Title, TitleCell
24
+ from .unique_id import UniqueID, UniqueIDCell
25
+ from .url import URL, URLCell
26
+ from .verification import Verification, VerificationCell
27
+
28
# Lookup table: the "type" string of a database property payload -> the class
# whose from_dict parses that payload (used by map_properties).
db_prop_type_mapping = {
    "checkbox": Checkbox,
    "created_by": CreatedBy,
    "created_time": CreatedTime,
    "date": Date,
    "email": Email,
    "files": Files,
    "formula": Formula,
    "last_edited_by": LastEditedBy,
    "last_edited_time": LastEditedTime,
    "multi_select": MultiSelect,
    "number": Number,
    "people": People,
    "phone_number": PhoneNumber,
    "relation": Relation,
    "rich_text": RichText,
    "rollup": Rollup,
    "select": Select,
    "status": Status,
    "title": Title,
    "unique_id": UniqueID,
    "url": URL,
    "verification": Verification,
}
52
+
53
+
54
def map_properties(props: Dict[str, dict]) -> Dict[str, DBPropertyBase]:
    """Parse raw database property payloads into typed property objects.

    Each value's ``"type"`` selects a class from ``db_prop_type_mapping`` and
    the value is handed to that class's ``from_dict``.

    Raises:
        KeyError: if a value has no ``"type"``, the type is not in the
            mapping, or ``from_dict`` itself raises ``KeyError``. The message
            names the offending entry and the original error is chained.
    """
    parsed: Dict[str, DBPropertyBase] = {}
    for k, v in props.items():
        try:
            property_cls = db_prop_type_mapping[v["type"]]
            parsed[k] = property_cls.from_dict(v)  # type: ignore
        except KeyError as ke:
            raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke

    return parsed
63
+
64
+
65
# Lookup table: the "type" string of a database cell payload -> the cell class
# whose from_dict parses that payload (used by map_cells).
db_cell_type_mapping = {
    "checkbox": CheckboxCell,
    "created_by": CreatedByCell,
    "created_time": CreatedTimeCell,
    "date": DateCell,
    "email": EmailCell,
    "files": FilesCell,
    "formula": FormulaCell,
    "last_edited_by": LastEditedByCell,
    "last_edited_time": LastEditedTimeCell,
    "multi_select": MultiSelectCell,
    "number": NumberCell,
    "people": PeopleCell,
    "phone_number": PhoneNumberCell,
    "relation": RelationCell,
    "rich_text": RichTextCell,
    "rollup": RollupCell,
    "select": SelectCell,
    "status": StatusCell,
    "title": TitleCell,
    "unique_id": UniqueIDCell,
    "url": URLCell,
    "verification": VerificationCell,
}
89
+
90
+
91
def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]:
    """Parse raw database cell payloads into typed cell objects.

    Each value's ``"type"`` selects a class from ``db_cell_type_mapping`` and
    the value is handed to that class's ``from_dict``.

    Raises:
        KeyError: if a value has no ``"type"``, the type is not in the
            mapping, or ``from_dict`` itself raises ``KeyError``. The message
            names the offending entry and the original error is chained.
    """
    parsed: Dict[str, DBCellBase] = {}
    for k, v in props.items():
        try:
            cell_cls = db_cell_type_mapping[v["type"]]
            parsed[k] = cell_cls.from_dict(v)  # type: ignore
        except KeyError as ke:
            raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke

    return parsed
101
+
102
+
103
# Public API of this package: only the two mapping helpers are exported.
__all__ = ["map_properties", "map_cells"]