unstructured-ingest 0.3.13 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,349 @@
1
+ from typing import Any, Generator, List, Optional, Tuple
2
+
3
+ import httpx
4
+ import notion_client.errors
5
+ from notion_client import Client as NotionClient
6
+ from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
7
+ from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint
8
+ from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
9
+ from notion_client.api_endpoints import Endpoint
10
+ from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint
11
+ from notion_client.errors import HTTPResponseError, RequestTimeoutError
12
+
13
+ from unstructured_ingest.ingest_backoff import RetryHandler
14
+ from unstructured_ingest.interfaces import RetryStrategyConfig
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+ from unstructured_ingest.v2.processes.connectors.notion.types.block import Block
17
+ from unstructured_ingest.v2.processes.connectors.notion.types.database import Database
18
+ from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells
19
+ from unstructured_ingest.v2.processes.connectors.notion.types.page import Page
20
+
21
+
22
@requires_dependencies(["httpx"], extras="notion")
def _get_retry_strategy(
    endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig
) -> RetryHandler:
    """Build the ``RetryHandler`` used for calls made through ``endpoint``.

    The handler is created with ``backoff.expo`` and retries on httpx timeouts, httpx
    status errors and Notion ``HTTPResponseError``. The time and attempt limits come from
    ``retry_strategy_config``; logging uses the logger of the endpoint's parent client at
    that logger's current level.
    """
    import backoff
    import httpx

    parent_logger = endpoint.parent.logger
    log_level = parent_logger.level

    retry_on = (
        httpx.TimeoutException,
        httpx.HTTPStatusError,
        notion_client.errors.HTTPResponseError,
    )

    return RetryHandler(
        backoff.expo,
        retry_on,
        max_time=retry_strategy_config.max_retry_time,
        max_tries=retry_strategy_config.max_retries,
        logger=parent_logger,
        start_log_level=log_level,
        backoff_log_level=log_level,
    )
44
+
45
+
46
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Return a retry handler for ``endpoint``, or ``None`` when retries are not configured.

    Args:
        endpoint: Endpoint whose ``retry_strategy_config`` attribute (if any) is read.

    Returns:
        A ``RetryHandler`` built by ``_get_retry_strategy`` when the endpoint carries a
        truthy ``retry_strategy_config``; otherwise ``None``.
    """
    # Default to None so an endpoint that never set the attribute means "no retries"
    # rather than raising AttributeError.
    retry_strategy_config = getattr(endpoint, "retry_strategy_config", None)
    if not retry_strategy_config:
        return None
    return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
50
+
51
+
52
class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Block-children endpoint that returns parsed ``Block`` objects.

    Requests go through a ``RetryHandler`` when ``retry_strategy_config`` is set and
    straight to the parent implementation otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler derived from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]:
        """Fetch one page of children for ``block_id``.

        Returns the parsed blocks together with the remainder of the response
        (everything except ``results``).
        """
        fetch = super().list
        if handler := self.retry_handler:
            payload: dict = handler(fetch, block_id=block_id, **kwargs)  # type: ignore
        else:
            payload = fetch(block_id=block_id, **kwargs)  # type: ignore
        parsed = [Block.from_dict(data=raw) for raw in payload.pop("results", [])]
        return parsed, payload

    def iterate_list(
        self,
        block_id: str,
        **kwargs: Any,
    ) -> Generator[List[Block], None, None]:
        """Yield the children of ``block_id`` one response page at a time.

        Iteration stops once the response reports no more results or carries no cursor.
        """
        fetch = super().list
        cursor = None
        more_pages = True
        while more_pages:
            if handler := self.retry_handler:
                payload: dict = handler(
                    fetch, block_id=block_id, start_cursor=cursor, **kwargs
                )  # type: ignore
            else:
                payload = fetch(
                    block_id=block_id, start_cursor=cursor, **kwargs
                )  # type: ignore
            yield [Block.from_dict(data=raw) for raw in payload.pop("results", [])]

            cursor = payload.get("next_cursor")
            more_pages = bool(payload.get("has_more") and cursor)
95
+
96
+
97
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint that returns parsed ``Database`` / ``Page`` objects.

    Requests go through a ``RetryHandler`` when ``retry_strategy_config`` is set and
    straight to the underlying call otherwise.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler derived from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def _call_with_retry(self, func, *args: Any, **kwargs: Any) -> Any:
        """Invoke ``func`` through the retry handler when one is configured."""
        # Resolve the property once: every access builds a new handler.
        handler = self.retry_handler
        if handler:
            return handler(func, *args, **kwargs)
        return func(*args, **kwargs)

    @staticmethod
    def _pages_from_response(response: dict) -> List[Page]:
        """Pop ``results`` from ``response`` and parse them into pages with mapped cells."""
        # A missing ``results`` key is treated as an empty page list.
        pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
        for page in pages:
            page.properties = map_cells(page.properties)
        return pages

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Retrieve a database and parse it into a ``Database``."""
        resp: dict = self._call_with_retry(
            super().retrieve, database_id=database_id, **kwargs
        )  # type: ignore
        return Database.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a ``HEAD`` request for the database and return the HTTP status code.

        Only ``auth`` is read from ``kwargs``.

        Raises:
            RequestTimeoutError: if the httpx call times out.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        try:
            response: httpx.Response = self._call_with_retry(
                self.parent.client.send, request
            )  # type: ignore
        except httpx.TimeoutException:
            raise RequestTimeoutError()
        return response.status_code

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database.

        *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)*
        """  # noqa: E501
        resp: dict = self._call_with_retry(
            super().query, database_id=database_id, **kwargs
        )  # type: ignore
        pages = self._pages_from_response(resp)
        return pages, resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the pages of a database query one response page at a time.

        Iteration stops once the response reports no more results or carries no cursor.
        """
        next_cursor = None
        while True:
            response: dict = self._call_with_retry(
                super().query, database_id=database_id, start_cursor=next_cursor, **kwargs
            )  # type: ignore
            yield self._pages_from_response(response)

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
171
+
172
+
173
class BlocksEndpoint(NotionBlocksEndpoint):
    """Blocks endpoint that returns parsed ``Block`` objects, with optional retries."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config
        # The children endpoint is built from the same arguments so it shares the
        # retry configuration.
        self.children = BlocksChildrenEndpoint(
            *args,
            retry_strategy_config=retry_strategy_config,
            **kwargs,
        )

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler derived from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def retrieve(self, block_id: str, **kwargs: Any) -> Block:
        """Retrieve a single block and parse it into a ``Block``."""
        fetch = super().retrieve
        if handler := self.retry_handler:
            raw: dict = handler(fetch, block_id=block_id, **kwargs)  # type: ignore
        else:
            raw = fetch(block_id=block_id, **kwargs)  # type: ignore
        return Block.from_dict(data=raw)
199
+
200
+
201
class PagesEndpoint(NotionPagesEndpoint):
    """Pages endpoint that returns parsed ``Page`` objects, with optional retries."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler derived from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def retrieve(self, page_id: str, **kwargs: Any) -> Page:
        """Retrieve a single page and parse it into a ``Page``."""
        fetch = super().retrieve
        if handler := self.retry_handler:
            raw: dict = handler(fetch, page_id=page_id, **kwargs)  # type: ignore
        else:
            raw = fetch(page_id=page_id, **kwargs)  # type: ignore
        return Page.from_dict(data=raw)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, page_id: str, **kwargs) -> int:
        """Send a ``HEAD`` request for the page and return the HTTP status code.

        Only ``auth`` is read from ``kwargs``. An httpx timeout is re-raised as
        ``RequestTimeoutError``.
        """
        import httpx

        head_request = self.parent._build_request(
            method="HEAD",
            path=f"pages/{page_id}",
            auth=kwargs.get("auth"),
        )
        try:
            if handler := self.retry_handler:
                reply: httpx.Response = handler(self.parent.client.send, head_request)
            else:
                reply = self.parent.client.send(head_request)
            return reply.status_code
        except httpx.TimeoutException:
            raise RequestTimeoutError()
241
+
242
+
243
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are the retry-aware ones."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        # Every endpoint gets the same retry configuration and points back at this client.
        endpoint_kwargs = {"retry_strategy_config": retry_strategy_config, "parent": self}
        self.blocks = BlocksEndpoint(**endpoint_kwargs)
        self.pages = PagesEndpoint(**endpoint_kwargs)
        self.databases = DatabasesEndpoint(**endpoint_kwargs)
254
+
255
+
256
class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Async block-children endpoint backed by its own ``httpx.AsyncClient``.

    NOTE(review): the async client is created without any headers or auth configured
    here, and request URLs rely on ``self.parent._api_base`` -- confirm both against the
    parent client before depending on this class.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._http_client = httpx.AsyncClient()

    async def list(self, block_id: str, **kwargs: Any) -> tuple[List[Block], dict]:
        """Fetch the list of child blocks asynchronously.

        ``kwargs`` are forwarded unchanged to ``httpx.AsyncClient.get``.

        Returns:
            The parsed child blocks and the remainder of the JSON response.

        Raises:
            HTTPResponseError: if the response has an error status.
            RequestTimeoutError: if the request times out.
        """
        url = f"{self.parent._api_base}/blocks/{block_id}/children"
        try:
            response = await self._http_client.get(url, **kwargs)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # HTTPResponseError takes the failed response first and the message second;
            # passing only a string fails inside its constructor.
            raise HTTPResponseError(e.response, f"Failed to list blocks: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise RequestTimeoutError() from e

        resp = response.json()
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    # NOTE(review): this is an async generator; the return annotation would more
    # accurately be ``AsyncGenerator[List[Block], None]`` (not imported in this module).
    async def iterate_list(
        self, block_id: str, **kwargs: Any
    ) -> Generator[List[Block], None, None]:
        """Fetch the list of child blocks in pages asynchronously.

        ``kwargs`` are forwarded to ``list`` for every page. Iteration stops once the
        response reports no more results or carries no cursor.
        """
        next_cursor = None
        while True:
            request_kwargs = dict(kwargs)
            if next_cursor:
                # ``AsyncClient.get`` has no ``start_cursor`` keyword, so the cursor is
                # sent as a query parameter, merged with any caller-supplied ``params``.
                query = dict(request_kwargs.get("params") or {})
                query["start_cursor"] = next_cursor
                request_kwargs["params"] = query
            child_blocks, response = await self.list(block_id, **request_kwargs)
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
295
+
296
+
297
class AsyncDatabasesEndpoint(NotionDatabasesEndpoint):
    """Async databases endpoint backed by its own ``httpx.AsyncClient``.

    NOTE(review): the async client is created without any headers or auth configured
    here, and request URLs rely on ``self.parent._api_base`` -- confirm both against the
    parent client before depending on this class.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._http_client = httpx.AsyncClient()

    async def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch a database by its ID asynchronously.

        ``kwargs`` are forwarded unchanged to ``httpx.AsyncClient.get``.

        Raises:
            HTTPResponseError: if the response has an error status.
            RequestTimeoutError: if the request times out.
        """
        url = f"{self.parent._api_base}/databases/{database_id}"
        try:
            response = await self._http_client.get(url, **kwargs)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # HTTPResponseError takes the failed response first and the message second;
            # passing only a string fails inside its constructor.
            raise HTTPResponseError(
                e.response, f"Failed to retrieve database: {str(e)}"
            ) from e
        except httpx.TimeoutException as e:
            raise RequestTimeoutError() from e

        return Database.from_dict(data=response.json())

    async def query(self, database_id: str, **kwargs: Any) -> tuple[List[Page], dict]:
        """Query a database asynchronously.

        Only the ``json`` entry of ``kwargs`` is used; it becomes the request body and
        defaults to an empty object.

        Returns:
            The parsed pages (with mapped property cells) and the remainder of the
            JSON response.

        Raises:
            HTTPResponseError: if the response has an error status.
            RequestTimeoutError: if the request times out.
        """
        url = f"{self.parent._api_base}/databases/{database_id}/query"
        try:
            response = await self._http_client.post(url, json=kwargs.get("json", {}))
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            raise HTTPResponseError(e.response, f"Failed to query database: {str(e)}") from e
        except httpx.TimeoutException as e:
            raise RequestTimeoutError() from e

        resp = response.json()
        pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    async def close(self):
        """Close the HTTP client."""
        await self._http_client.aclose()
338
+
339
+
340
class AsyncClient(NotionClient):
    """Notion client whose ``blocks`` and ``databases`` attributes are the async endpoints.

    Note that ``blocks`` is bound to the block-*children* endpoint.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.blocks = AsyncBlocksChildrenEndpoint(parent=self)
        self.databases = AsyncDatabasesEndpoint(parent=self)

    async def close(self):
        """Close all async endpoints."""
        # Close the second endpoint even when closing the first one fails, so its
        # HTTP client is not leaked.
        try:
            await self.blocks.close()
        finally:
            await self.databases.close()
@@ -0,0 +1,346 @@
1
+ from dataclasses import dataclass
2
+ from time import time
3
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
4
+
5
+ from pydantic import UUID4, Field, Secret
6
+
7
+ from unstructured_ingest.error import SourceConnectionError
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import (
10
+ AccessConfig,
11
+ ConnectionConfig,
12
+ Downloader,
13
+ DownloaderConfig,
14
+ DownloadResponse,
15
+ FileData,
16
+ FileDataSourceMetadata,
17
+ Indexer,
18
+ IndexerConfig,
19
+ SourceIdentifiers,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
23
+
24
+ if TYPE_CHECKING:
25
+ from unstructured_ingest.v2.processes.connectors.notion.client import Client
26
+
27
+ NOTION_API_VERSION = "2022-06-28"
28
+ CONNECTOR_TYPE = "notion"
29
+
30
+
31
class NotionAccessConfig(AccessConfig):
    # Credential used as ``auth`` when the Notion client is created.
    notion_api_key: str = Field(
        description="Notion API key",
    )
33
+
34
+
35
class NotionConnectionConfig(ConnectionConfig):
    # Wrapped in ``Secret``; unwrapped only when the client is built.
    access_config: Secret[NotionAccessConfig]

    @requires_dependencies(["notion_client"], extras="notion")
    def get_client(self) -> "Client":
        """Create a Notion ``Client`` pinned to ``NOTION_API_VERSION``.

        The client authenticates with the configured API key and logs through the
        connector logger at that logger's current level.
        """
        from unstructured_ingest.v2.processes.connectors.notion.client import Client

        api_key = self.access_config.get_secret_value().notion_api_key
        client_options = {
            "notion_version": NOTION_API_VERSION,
            "auth": api_key,
            "logger": logger,
            "log_level": logger.level,
        }
        return Client(**client_options)
48
+
49
+
50
class NotionIndexerConfig(IndexerConfig):
    # Explicit pages to index; None means no pages were requested.
    page_ids: Optional[list[str]] = Field(
        default=None,
        description="List of Notion page IDs to process",
    )

    # Explicit databases to index; None means no databases were requested.
    database_ids: Optional[list[str]] = Field(
        default=None,
        description="List of Notion database IDs to process",
    )

    # When set, children discovered under each page/database are indexed as well.
    recursive: bool = Field(
        default=False,
        description="Recursively process child pages and databases",
    )

    # NOTE(review): whether this hook is ever invoked depends on the IndexerConfig base
    # class, which is not visible here -- confirm. It also calls ``UUID4(...)``; verify
    # that pydantic's ``UUID4`` is callable in the pinned version.
    def __post_init__(self):
        """Strip whitespace from the configured IDs and convert each one with ``UUID4``."""
        if self.page_ids:
            self.page_ids = [UUID4(raw_id.strip()) for raw_id in self.page_ids]

        if self.database_ids:
            self.database_ids = [UUID4(raw_id.strip()) for raw_id in self.database_ids]
68
+
69
+
70
@dataclass
class NotionIndexer(Indexer):
    """Enumerate Notion pages and databases as ``FileData`` records.

    Starts from the IDs in ``index_config`` and, when ``index_config.recursive`` is set,
    keeps following child pages and databases until none are left unprocessed.
    """

    connection_config: NotionConnectionConfig
    index_config: NotionIndexerConfig

    def is_async(self) -> bool:
        """Indexing is synchronous; ``run_async`` is not implemented."""
        return False

    def precheck(self) -> None:
        """Check the connection to the Notion API.

        Raises:
            SourceConnectionError: if the client cannot be created or the probe
                request fails; the original exception is attached as the cause.
        """
        try:
            client = self.connection_config.get_client()
            # Perform a simple request to verify connection
            request = client._build_request("HEAD", "users")
            response = client.client.send(request)
            response.raise_for_status()

        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every reachable page and database.

        Items whose metadata cannot be retrieved are skipped (the failure is logged by
        the ``get_*_file_data`` helpers) but their children are still explored when
        recursion is enabled.
        """
        client = self.connection_config.get_client()
        processed_pages: set[str] = set()
        processed_databases: set[str] = set()

        pages_to_process: set[str] = set(self.index_config.page_ids or [])
        databases_to_process: set[str] = set(self.index_config.database_ids or [])

        while pages_to_process or databases_to_process:
            # Process pages
            for page_id in list(pages_to_process):
                # Always drop the ID from the pending set first, so an ID that was
                # already processed can never keep the outer loop alive.
                pages_to_process.discard(page_id)
                if page_id in processed_pages:
                    continue
                processed_pages.add(page_id)

                file_data = self.get_page_file_data(page_id=page_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    child_pages, child_databases = self.get_child_pages_and_databases(
                        page_id=page_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

            # Process databases
            for database_id in list(databases_to_process):
                databases_to_process.discard(database_id)
                if database_id in processed_databases:
                    continue
                processed_databases.add(database_id)

                file_data = self.get_database_file_data(database_id=database_id, client=client)
                if file_data:
                    yield file_data

                if self.index_config.recursive:
                    (
                        child_pages,
                        child_databases,
                    ) = self.get_child_pages_and_databases_from_database(
                        database_id=database_id,
                        client=client,
                        processed_pages=processed_pages,
                        processed_databases=processed_databases,
                    )
                    pages_to_process.update(child_pages)
                    databases_to_process.update(child_databases)

    @staticmethod
    def _build_file_data(object_id: str, locator_key: str, notion_object: Any) -> FileData:
        """Assemble the ``FileData`` for a retrieved page or database.

        ``locator_key`` is the ``record_locator`` key (``"page_id"`` or ``"database_id"``).
        """
        filename = f"{object_id}.html"
        return FileData(
            identifier=object_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=filename,
                fullpath=filename,
                rel_path=filename,
            ),
            metadata=FileDataSourceMetadata(
                date_created=notion_object.created_time,
                date_modified=notion_object.last_edited_time,
                record_locator={locator_key: object_id},
                date_processed=str(time()),
            ),
            additional_metadata={
                "created_by": notion_object.created_by,
                "last_edited_by": notion_object.last_edited_by,
                "parent": notion_object.parent,
                "url": notion_object.url,
            },
        )

    @requires_dependencies(["notion_client"], extras="notion")
    def get_page_file_data(self, page_id: str, client: "Client") -> Optional[FileData]:
        """Retrieve a page and describe it as ``FileData``.

        Returns ``None`` (after logging the error) when retrieval or assembly fails.
        """
        try:
            page_metadata = client.pages.retrieve(page_id=page_id)  # type: ignore
            return self._build_file_data(
                object_id=page_id, locator_key="page_id", notion_object=page_metadata
            )
        except Exception as e:
            logger.error(f"Error retrieving page {page_id}: {e}")
            return None

    @requires_dependencies(["notion_client"], extras="notion")
    def get_database_file_data(self, database_id: str, client: "Client") -> Optional[FileData]:
        """Retrieve a database and describe it as ``FileData``.

        Returns ``None`` (after logging the error) when retrieval or assembly fails.
        """
        try:
            database_metadata = client.databases.retrieve(  # type: ignore
                database_id=database_id
            )
            return self._build_file_data(
                object_id=database_id,
                locator_key="database_id",
                notion_object=database_metadata,
            )
        except Exception as e:
            logger.error(f"Error retrieving database {database_id}: {e}")
            return None

    def get_child_pages_and_databases(
        self,
        page_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed child pages and databases found under a page."""
        from unstructured_ingest.v2.processes.connectors.notion.helpers import (
            get_recursive_content_from_page,
        )

        child_content = get_recursive_content_from_page(
            client=client,
            page_id=page_id,
            logger=logger,
        )
        child_pages = set(child_content.child_pages) - processed_pages
        child_databases = set(child_content.child_databases) - processed_databases
        return child_pages, child_databases

    def get_child_pages_and_databases_from_database(
        self,
        database_id: str,
        client: "Client",
        processed_pages: set[str],
        processed_databases: set[str],
    ) -> tuple[set[str], set[str]]:
        """Return the not-yet-processed child pages and databases found under a database."""
        from unstructured_ingest.v2.processes.connectors.notion.helpers import (
            get_recursive_content_from_database,
        )

        child_content = get_recursive_content_from_database(
            client=client,
            database_id=database_id,
            logger=logger,
        )
        child_pages = set(child_content.child_pages) - processed_pages
        child_databases = set(child_content.child_databases) - processed_databases
        return child_pages, child_databases

    async def run_async(self, **kwargs: Any) -> AsyncGenerator[None, None]:
        """Asynchronous indexing is not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError()
259
+
260
+
261
class NotionDownloaderConfig(DownloaderConfig):
    # No Notion-specific download options; everything comes from DownloaderConfig.
    ...
263
+
264
+
265
@dataclass
class NotionDownloader(Downloader):
    """Render a Notion page or database to an HTML file on disk.

    The target is chosen from ``file_data.metadata.record_locator``: a ``page_id`` entry
    selects a page download, a ``database_id`` entry a database download.
    """

    connection_config: NotionConnectionConfig
    download_config: NotionDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the page or database that ``file_data`` points at.

        Raises:
            ValueError: if the record locator has neither ``page_id`` nor ``database_id``.
        """
        client = self.connection_config.get_client()
        record_locator = file_data.metadata.record_locator

        if "page_id" in record_locator:
            return self.download_page(
                client=client,
                page_id=record_locator["page_id"],
                file_data=file_data,
            )
        elif "database_id" in record_locator:
            return self.download_database(
                client=client,
                database_id=record_locator["database_id"],
                file_data=file_data,
            )
        else:
            raise ValueError("Invalid record_locator in file_data")

    def _write_html_file(self, html: str, file_data: FileData) -> DownloadResponse:
        """Write ``html`` to the download path for ``file_data`` and build the response."""
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly so the output does not depend on the platform's
        # default text encoding.
        with download_path.open("w", encoding="utf-8") as html_file:
            html_file.write(html)
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def download_page(
        self, client, page_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a page as HTML and save it.

        Returns ``None`` (after logging an error) when the page has no HTML content or
        when extraction or writing fails.
        """
        from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_page_html

        try:
            text_extraction = extract_page_html(
                client=client,
                page_id=page_id,
                logger=logger,
            )
            if not text_extraction.html:
                logger.error(f"No HTML content for page {page_id}")
                return None
            return self._write_html_file(
                html=text_extraction.html.render(pretty=True), file_data=file_data
            )
        except Exception as e:
            logger.error(f"Error downloading page {page_id}: {e}")
            return None

    def download_database(
        self, client, database_id: str, file_data: FileData
    ) -> Optional[DownloadResponse]:
        """Extract a database as HTML and save it.

        Returns ``None`` (after logging an error) when the database has no HTML content
        or when extraction or writing fails.
        """
        from unstructured_ingest.v2.processes.connectors.notion.helpers import extract_database_html

        try:
            text_extraction = extract_database_html(
                client=client,
                database_id=database_id,
                logger=logger,
            )
            if not text_extraction.html:
                logger.error(f"No HTML content for database {database_id}")
                return None
            return self._write_html_file(
                html=text_extraction.html.render(pretty=True), file_data=file_data
            )
        except Exception as e:
            logger.error(f"Error downloading database {database_id}: {e}")
            return None
338
+
339
+
340
+ notion_source_entry = SourceRegistryEntry(
341
+ connection_config=NotionConnectionConfig,
342
+ indexer_config=NotionIndexerConfig,
343
+ indexer=NotionIndexer,
344
+ downloader_config=NotionDownloaderConfig,
345
+ downloader=NotionDownloader,
346
+ )