unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,57 @@
1
+ # https://developers.notion.com/reference/block#synced-block
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class OriginalSyncedBlock(BlockBase):
    """Synced block variant that carries its own ``children`` content.

    See https://developers.notion.com/reference/block#synced-block
    """

    synced_from: Optional[str] = None
    children: List[dict] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from its payload dict.

        Only the ``children`` entry is read; ``synced_from`` keeps its default.
        A payload without ``children`` yields an empty list instead of raising
        ``KeyError``.
        """
        # NOTE(review): retrieved synced-block payloads appear to omit
        # ``children`` (they are fetched separately) -- confirm against the API.
        return cls(children=data.get("children", []))

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML of its own."""
        return None
25
+
26
+
27
@dataclass
class DuplicateSyncedBlock(BlockBase):
    """Synced block variant described only by a ``type`` and a ``block_id``."""

    type: str
    block_id: str

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML of its own."""
        return None
42
+
43
+
44
class SyncBlock(BlockBase):
    """Entry point that picks the right synced-block class for a payload.

    See https://developers.notion.com/reference/block#synced-block
    """

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Return an ``OriginalSyncedBlock`` or ``DuplicateSyncedBlock`` for ``data``.

        Dispatch rules:

        * ``synced_from`` is a dict (a reference to another block): build a
          ``DuplicateSyncedBlock`` from that reference object.
        * ``synced_from`` is present but not a dict (e.g. ``None``): build an
          ``OriginalSyncedBlock`` from ``data``.
        * ``synced_from`` is absent: treat ``data`` itself as the flat duplicate
          payload, as the previous implementation did.
        """
        synced_from = data.get("synced_from")
        # The key is present for both variants, so the *value* has to decide:
        # testing only for key presence would never produce a duplicate block.
        if isinstance(synced_from, dict):
            return DuplicateSyncedBlock.from_dict(synced_from)
        if "synced_from" in data:
            return OriginalSyncedBlock.from_dict(data)
        return DuplicateSyncedBlock.from_dict(data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML of its own."""
        return None
@@ -0,0 +1,63 @@
1
+ # https://developers.notion.com/reference/block#table
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag, Td, Th, Tr
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import (
8
+ BlockBase,
9
+ FromJSONMixin,
10
+ )
11
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
12
+
13
+
14
@dataclass
class Table(BlockBase):
    """Table block: stores the table width and the two header flags."""

    table_width: int
    has_column_header: bool
    has_row_header: bool

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the table by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: the table block itself produces no HTML."""
        return None
30
+
31
+
32
@dataclass
class TableCell(FromJSONMixin):
    """One table cell, holding a list of rich-text fragments."""

    rich_texts: List[RichText]

    @classmethod
    def from_dict(cls, data: dict):
        """Build a cell from ``data``.

        The ``rich_texts`` entry is popped from ``data`` (so the caller's dict is
        modified); a missing entry is treated as an empty list.
        """
        raw_fragments = data.pop("rich_texts", [])
        return cls(rich_texts=[RichText.from_dict(fragment) for fragment in raw_fragments])

    def get_html(self, is_header: bool) -> Optional[HtmlTag]:
        """Render the cell as ``Th`` when ``is_header`` is truthy, otherwise as ``Td``."""
        cell_tag = Th if is_header else Td
        return cell_tag([], [fragment.get_html() for fragment in self.rich_texts])
45
+
46
+
47
+ # https://developers.notion.com/reference/block#table-rows
48
@dataclass
class TableRow(BlockBase):
    """Table row block made of ``TableCell`` objects."""

    is_header: bool = False
    cells: List[TableCell] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict):
        """Build a row from ``data``.

        Each item of ``data["cells"]`` (empty list when the key is absent) is
        wrapped as ``{"rich_texts": item}`` and parsed into a ``TableCell``.
        ``is_header`` is left at its default.
        """
        parsed_cells = [
            TableCell.from_dict({"rich_texts": raw_cell}) for raw_cell in data.get("cells", [])
        ]
        return cls(cells=parsed_cells)

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type reports that it holds no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render the row as a ``Tr`` whose cells use this row's ``is_header`` flag."""
        rendered_cells = [cell.get_html(is_header=self.is_header) for cell in self.cells]
        return Tr([], rendered_cells)
@@ -0,0 +1,23 @@
1
+ # https://developers.notion.com/reference/block#table-of-contents
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+
9
+
10
@dataclass
class TableOfContents(BlockBase):
    """Table-of-contents block; the only stored attribute is its color."""

    color: str

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type reports that it holds no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML."""
        return None
@@ -0,0 +1,30 @@
1
+ # https://developers.notion.com/reference/block#template
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
8
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
9
+
10
+
11
@dataclass
class Template(BlockBase):
    """Template block with optional children and rich-text content."""

    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``rich_text`` is popped from ``data`` (modifying the caller's dict); the
        remaining keys go to the constructor and the parsed rich text is attached
        afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text inside a ``Div``; return None when there is none."""
        if self.rich_text:
            return Div([], [item.get_html() for item in self.rich_text])
        return None
@@ -0,0 +1,42 @@
1
+ # https://developers.notion.com/reference/block#to-do
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Checked, Style, Type
6
+ from htmlBuilder.tags import Div, HtmlTag, Input
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class ToDo(BlockBase):
    """To-do block: a checkbox state, a color and rich-text content."""

    color: str
    checked: bool = False
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``rich_text`` is popped from ``data`` (modifying the caller's dict); the
        remaining keys go to the constructor and the parsed rich text is attached
        afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render a checkbox input followed by the rich text, wrapped in a ``Div``.

        Returns None when there is no rich text. The checkbox gets a ``Checked``
        attribute when ``checked`` is truthy, and the ``Div`` gets a color style
        unless the color is empty or ``"default"``.
        """
        if not self.rich_text:
            return None

        checkbox_attrs = [Type("checkbox")] + ([Checked("")] if self.checked else [])
        children = [Input(checkbox_attrs)]
        children += [item.get_html() for item in self.rich_text]

        has_custom_color = bool(self.color) and self.color != "default"
        div_attrs = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(div_attrs, children)
@@ -0,0 +1,37 @@
1
+ # https://developers.notion.com/reference/block#toggle-blocks
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional
4
+
5
+ from htmlBuilder.attributes import Style
6
+ from htmlBuilder.tags import Div, HtmlTag
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.rich_text import RichText
10
+
11
+
12
@dataclass
class Toggle(BlockBase):
    """Toggle block: a color, optional children and rich-text content."""

    color: str
    children: List[dict] = field(default_factory=list)
    rich_text: List[RichText] = field(default_factory=list)

    @staticmethod
    def can_have_children() -> bool:
        """Return True: this block type reports that it may hold child blocks."""
        return True

    @classmethod
    def from_dict(cls, data: dict):
        """Build the block from ``data``.

        ``rich_text`` is popped from ``data`` (modifying the caller's dict); the
        remaining keys go to the constructor and the parsed rich text is attached
        afterwards.
        """
        raw_rich_text = data.pop("rich_text", [])
        instance = cls(**data)
        instance.rich_text = [RichText.from_dict(item) for item in raw_rich_text]
        return instance

    def get_html(self) -> Optional[HtmlTag]:
        """Render the rich text in a ``Div``; return None when there is none.

        The ``Div`` gets a color style unless the color is empty or ``"default"``.
        """
        if not self.rich_text:
            return None

        rendered = [item.get_html() for item in self.rich_text]
        has_custom_color = bool(self.color) and self.color != "default"
        div_attrs = [Style(f"color: {self.color}")] if has_custom_color else []
        return Div(div_attrs, rendered)
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.tags import HtmlTag
5
+
6
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
7
+
8
+
9
@dataclass
class Unsupported(BlockBase):
    """Placeholder block that stores nothing and renders nothing."""

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type reports that it holds no child blocks."""
        return False

    @classmethod
    def from_dict(cls, data: dict):
        """Return an empty instance; the contents of ``data`` are ignored."""
        return cls()

    def get_html(self) -> Optional[HtmlTag]:
        """Return None: this block produces no HTML."""
        return None
@@ -0,0 +1,22 @@
1
+ # https://developers.notion.com/reference/block#video
2
+ from typing import Optional
3
+
4
+ from htmlBuilder.attributes import Src
5
+ from htmlBuilder.tags import HtmlTag, Source
6
+ from htmlBuilder.tags import Video as VideoHtml
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import BlockBase
9
+ from unstructured_ingest.connector.notion.types.file import FileObject
10
+
11
+
12
class Video(BlockBase, FileObject):
    """Video block backed by the ``external`` / ``file`` attributes of ``FileObject``."""

    @staticmethod
    def can_have_children() -> bool:
        """Return False: this block type reports that it holds no child blocks."""
        return False

    def get_html(self) -> Optional[HtmlTag]:
        """Render a video tag with one ``Source`` pointing at the video URL.

        ``external`` takes precedence over ``file``; None is returned when
        neither is set.
        """
        # ``or`` short-circuits, so ``file`` is only consulted when ``external`` is falsy.
        location = self.external or self.file
        if not location:
            return None
        return VideoHtml([], [Source([Src(location.url)], [location.url])])
@@ -0,0 +1,73 @@
1
+ # https://developers.notion.com/reference/database
2
+ from dataclasses import dataclass, field
3
+ from typing import Dict, List, Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag, Span
6
+
7
+ from unstructured_ingest.v2.processes.connectors.notion.interfaces import (
8
+ DBPropertyBase,
9
+ FromJSONMixin,
10
+ GetHTMLMixin,
11
+ )
12
+ from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import (
13
+ map_properties,
14
+ )
15
+ from unstructured_ingest.v2.processes.connectors.notion.types.file import FileObject
16
+ from unstructured_ingest.v2.processes.connectors.notion.types.parent import Parent
17
+ from unstructured_ingest.v2.processes.connectors.notion.types.rich_text import RichText
18
+ from unstructured_ingest.v2.processes.connectors.notion.types.user import PartialUser
19
+
20
+
21
@dataclass
class Database(FromJSONMixin, GetHTMLMixin):
    """Notion database object (https://developers.notion.com/reference/database)."""

    id: str
    created_time: str
    created_by: PartialUser
    last_edited_time: str
    last_edited_by: PartialUser
    archived: bool
    in_trash: bool
    parent: Parent
    url: str
    is_inline: bool
    public_url: str
    request_id: Optional[str] = None
    properties: Dict[str, DBPropertyBase] = field(default_factory=dict)
    title: List[RichText] = field(default_factory=list)
    description: List[RichText] = field(default_factory=list)
    icon: Optional[FileObject] = None
    cover: Optional[FileObject] = None
    object: str = "database"

    @classmethod
    def from_dict(cls, data: dict):
        """Build a ``Database`` from its payload dict.

        Nested entries are popped from ``data`` (so the caller's dict is
        modified) and converted to their typed form; whatever remains in
        ``data`` is forwarded to the constructor unchanged. All popped keys
        except ``properties`` are required and raise ``KeyError`` when absent.
        """
        raw_created_by = data.pop("created_by")
        raw_last_edited_by = data.pop("last_edited_by")
        raw_icon = data.pop("icon")
        raw_cover = data.pop("cover")
        raw_parent = data.pop("parent")
        raw_title = data.pop("title")
        raw_description = data.pop("description")
        raw_properties = data.pop("properties", {})

        return cls(
            properties=map_properties(raw_properties),
            created_by=PartialUser.from_dict(raw_created_by),
            last_edited_by=PartialUser.from_dict(raw_last_edited_by),
            # icon and cover are only parsed when the payload value is truthy.
            icon=FileObject.from_dict(raw_icon) if raw_icon else None,
            cover=FileObject.from_dict(raw_cover) if raw_cover else None,
            parent=Parent.from_dict(raw_parent),
            title=[RichText.from_dict(data=item) for item in raw_title],
            description=[RichText.from_dict(data=item) for item in raw_description],
            **data,
        )

    def get_html(self) -> Optional[HtmlTag]:
        """Render title and description as ``Span`` elements inside a ``Div``.

        Empty sections are skipped; None is returned when both are empty.
        """
        sections = [
            Span([], [fragment.get_html() for fragment in fragments])
            for fragments in (self.title, self.description)
            if fragments
        ]
        return Div([], sections) if sections else None
@@ -0,0 +1,106 @@
1
+ from typing import Dict
2
+
3
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
4
+
5
+ from .checkbox import Checkbox, CheckboxCell
6
+ from .created_by import CreatedBy, CreatedByCell
7
+ from .created_time import CreatedTime, CreatedTimeCell
8
+ from .date import Date, DateCell
9
+ from .email import Email, EmailCell
10
+ from .files import Files, FilesCell
11
+ from .formula import Formula, FormulaCell
12
+ from .last_edited_by import LastEditedBy, LastEditedByCell
13
+ from .last_edited_time import LastEditedTime, LastEditedTimeCell
14
+ from .multiselect import MultiSelect, MultiSelectCell
15
+ from .number import Number, NumberCell
16
+ from .people import People, PeopleCell
17
+ from .phone_number import PhoneNumber, PhoneNumberCell
18
+ from .relation import Relation, RelationCell
19
+ from .rich_text import RichText, RichTextCell
20
+ from .rollup import Rollup, RollupCell
21
+ from .select import Select, SelectCell
22
+ from .status import Status, StatusCell
23
+ from .title import Title, TitleCell
24
+ from .unique_id import UniqueID, UniqueIDCell
25
+ from .url import URL, URLCell
26
+ from .verification import Verification, VerificationCell
27
+
28
# Lookup table used by map_properties: the value of a property's "type" key
# selects the class whose from_dict parses that property.
db_prop_type_mapping = {
    "checkbox": Checkbox,
    "created_by": CreatedBy,
    "created_time": CreatedTime,
    "date": Date,
    "email": Email,
    "files": Files,
    "formula": Formula,
    "last_edited_by": LastEditedBy,
    "last_edited_time": LastEditedTime,
    "multi_select": MultiSelect,
    "number": Number,
    "people": People,
    "phone_number": PhoneNumber,
    "relation": Relation,
    "rich_text": RichText,
    "rollup": Rollup,
    "select": Select,
    "status": Status,
    "title": Title,
    "unique_id": UniqueID,
    "url": URL,
    "verification": Verification,
}
52
+
53
+
54
def map_properties(props: Dict[str, dict]) -> Dict[str, DBPropertyBase]:
    """Parse each raw property dict into its database-property class.

    The class is chosen from ``db_prop_type_mapping`` by the entry's ``"type"``
    value. Any ``KeyError`` raised while looking up or parsing an entry is
    re-raised as a ``KeyError`` naming the offending key and value.
    """
    parsed = {}
    for prop_name, raw_prop in props.items():
        try:
            prop_cls = db_prop_type_mapping[raw_prop["type"]]
            parsed[prop_name] = prop_cls.from_dict(raw_prop)  # type: ignore
        except KeyError as ke:
            raise KeyError(
                f"failed to map to associated database property -> {prop_name}: {raw_prop}"
            ) from ke

    return parsed
63
+
64
+
65
# Lookup table used by map_cells: the value of a cell's "type" key selects the
# class whose from_dict parses that cell.
db_cell_type_mapping = {
    "checkbox": CheckboxCell,
    "created_by": CreatedByCell,
    "created_time": CreatedTimeCell,
    "date": DateCell,
    "email": EmailCell,
    "files": FilesCell,
    "formula": FormulaCell,
    "last_edited_by": LastEditedByCell,
    "last_edited_time": LastEditedTimeCell,
    "multi_select": MultiSelectCell,
    "number": NumberCell,
    "people": PeopleCell,
    "phone_number": PhoneNumberCell,
    "relation": RelationCell,
    "rich_text": RichTextCell,
    "rollup": RollupCell,
    "select": SelectCell,
    "status": StatusCell,
    "title": TitleCell,
    "unique_id": UniqueIDCell,
    "url": URLCell,
    "verification": VerificationCell,
}
89
+
90
+
91
def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]:
    """Parse each raw cell dict into its database-cell class.

    The class is chosen from ``db_cell_type_mapping`` by the entry's ``"type"``
    value. Any ``KeyError`` raised while looking up or parsing an entry is
    re-raised as a ``KeyError`` naming the offending key and value.
    """
    parsed = {}
    for cell_name, raw_cell in props.items():
        try:
            cell_cls = db_cell_type_mapping[raw_cell["type"]]
            parsed[cell_name] = cell_cls.from_dict(raw_cell)  # type: ignore
        except KeyError as ke:
            raise KeyError(
                f"failed to map to associated database property -> {cell_name}: {raw_cell}"
            ) from ke

    return parsed
101
+
102
+
103
+ __all__ = [
104
+ "map_properties",
105
+ "map_cells",
106
+ ]
@@ -0,0 +1,38 @@
1
+ # https://developers.notion.com/reference/property-object#checkbox
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.attributes import Checked, Type
6
+ from htmlBuilder.tags import Div, HtmlTag, Input
7
+
8
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
9
+
10
+
11
@dataclass
class Checkbox(DBPropertyBase):
    """Checkbox database property (schema entry, not a cell value)."""

    id: str
    name: str
    type: str = "checkbox"
    checkbox: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property by passing every key of ``data`` to the constructor."""
        return cls(**data)
21
+
22
+
23
@dataclass
class CheckboxCell(DBCellBase):
    """Checkbox cell value of a database row."""

    id: str
    checkbox: bool
    name: Optional[str] = None
    type: str = "checkbox"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render a checkbox ``Input`` in a ``Div``, checked when the value is truthy."""
        input_attrs = [Type("checkbox")] + ([Checked("")] if self.checkbox else [])
        return Div([], Input(input_attrs))
@@ -0,0 +1,35 @@
1
+ # https://developers.notion.com/reference/property-object#created-by
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
8
+ from unstructured_ingest.connector.notion.types.user import People
9
+
10
+
11
@dataclass
class CreatedBy(DBPropertyBase):
    """Created-by database property (schema entry, not a cell value)."""

    id: str
    name: str
    type: str = "created_by"
    created_by: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property by passing every key of ``data`` to the constructor."""
        return cls(**data)
21
+
22
+
23
@dataclass
class CreatedByCell(DBCellBase):
    """Created-by cell value of a database row."""

    id: str
    created_by: People
    type: str = "created_by"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        ``created_by`` is popped from ``data`` (modifying the caller's dict) and
        parsed with ``People.from_dict``; the remaining keys go to the constructor.
        """
        creator = People.from_dict(data.pop("created_by"))
        return cls(created_by=creator, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Delegate rendering to the stored ``created_by`` object."""
        return self.created_by.get_html()
@@ -0,0 +1,34 @@
1
+ # https://developers.notion.com/reference/property-object#created-time
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
8
+
9
+
10
@dataclass
class CreatedTime(DBPropertyBase):
    """Created-time database property (schema entry, not a cell value)."""

    id: str
    name: str
    type: str = "created_time"
    created_time: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property by passing every key of ``data`` to the constructor."""
        return cls(**data)
20
+
21
+
22
@dataclass
class CreatedTimeCell(DBCellBase):
    """Created-time cell value of a database row."""

    id: str
    created_time: str
    type: str = "created_time"
    name: Optional[str] = None

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the stored ``created_time`` value inside a ``Div``."""
        timestamp = self.created_time
        return Div([], timestamp)
@@ -0,0 +1,41 @@
1
+ # https://developers.notion.com/reference/property-object#date
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
8
+ from unstructured_ingest.connector.notion.types.date import Date as DateType
9
+
10
+
11
@dataclass
class Date(DBPropertyBase):
    """Date database property (schema entry, not a cell value)."""

    id: str
    name: str
    type: str = "date"
    date: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property by passing every key of ``data`` to the constructor."""
        return cls(**data)
21
+
22
+
23
@dataclass
class DateCell(DBCellBase):
    """Date cell value of a database row; the date itself may be absent."""

    id: str
    date: Optional[DateType] = None
    name: Optional[str] = None
    type: str = "date"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell from ``data``.

        ``date`` is popped from ``data`` (modifying the caller's dict; the key is
        required) and parsed with ``DateType.from_dict`` only when truthy.
        """
        raw_date = data.pop("date")
        parsed_date = DateType.from_dict(raw_date) if raw_date else None
        return cls(date=parsed_date, **data)

    def get_html(self) -> Optional[HtmlTag]:
        """Delegate rendering to the stored date; return None when there is none."""
        if not self.date:
            return None
        return self.date.get_html()
@@ -0,0 +1,36 @@
1
+ # https://developers.notion.com/reference/property-object#email
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional
4
+
5
+ from htmlBuilder.tags import Div, HtmlTag
6
+
7
+ from unstructured_ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
8
+
9
+
10
@dataclass
class Email(DBPropertyBase):
    """Email database property (schema entry, not a cell value)."""

    id: str
    name: str
    type: str = "email"
    email: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict):
        """Build the property by passing every key of ``data`` to the constructor."""
        return cls(**data)
20
+
21
+
22
@dataclass
class EmailCell(DBCellBase):
    """Email cell value of a database row."""

    id: str
    email: str
    name: Optional[str] = None
    type: str = "email"

    @classmethod
    def from_dict(cls, data: dict):
        """Build the cell by passing every key of ``data`` to the constructor."""
        return cls(**data)

    def get_html(self) -> Optional[HtmlTag]:
        """Render the email inside a ``Div``; return None when the value is falsy."""
        if not self.email:
            return None
        return Div([], self.email)