unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,284 @@
1
+ import copy
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+ from unstructured_ingest.__version__ import __version__ as unstructured_version
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
9
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
10
+ from unstructured_ingest.interfaces import (
11
+ AccessConfig,
12
+ BaseConnectorConfig,
13
+ BaseDestinationConnector,
14
+ BaseIngestDocBatch,
15
+ BaseSingleIngestDoc,
16
+ BaseSourceConnector,
17
+ IngestDocCleanupMixin,
18
+ SourceConnectorCleanupMixin,
19
+ SourceMetadata,
20
+ )
21
+ from unstructured_ingest.logger import logger
22
+ from unstructured_ingest.utils.data_prep import flatten_dict
23
+ from unstructured_ingest.utils.dep_check import requires_dependencies
24
+
25
+ if t.TYPE_CHECKING:
26
+ from pymongo import MongoClient
27
+
28
+
29
# Version string handed to pymongo's ``ServerApi`` whenever a client is created.
SERVER_API_VERSION = "1"
30
+
31
+
32
def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
    """Split a ``user:password`` string at its first colon.

    Returns a ``(user, password)`` tuple. Without a colon the whole input is
    the user and the password is the empty string; any further colons stay
    in the password.
    """
    colon_at = userinfo.find(":")
    if colon_at < 0:
        return userinfo, ""
    return userinfo[:colon_at], userinfo[colon_at + 1 :]
35
+
36
+
37
@dataclass
class MongoDBAccessConfig(AccessConfig):
    """Access settings for MongoDB: an optional connection URI."""

    # Declared via enhanced_field with sensitive=True; None when no URI is given.
    uri: t.Optional[str] = enhanced_field(default=None, sensitive=True)
40
+
41
+
42
@dataclass
class SimpleMongoDBConfig(BaseConnectorConfig):
    """Connection settings used by the MongoDB source and destination connectors."""

    access_config: MongoDBAccessConfig
    host: t.Optional[str] = None
    database: t.Optional[str] = None
    collection: t.Optional[str] = None
    port: int = 27017
    batch_size: int = 100

    @requires_dependencies(["pymongo"], extras="mongodb")
    def generate_client(self) -> "MongoClient":
        """Create a new ``MongoClient`` for this configuration.

        The URI from ``access_config`` is used when it is set; otherwise the
        client is built from ``host`` and ``port``. Only the URI path attaches
        ``DriverInfo`` to the client.
        """
        from pymongo import MongoClient
        from pymongo.driver_info import DriverInfo
        from pymongo.server_api import ServerApi

        server_api = ServerApi(version=SERVER_API_VERSION)
        uri = self.access_config.uri
        if not uri:
            # No URI configured: fall back to the discrete host/port settings.
            return MongoClient(host=self.host, port=self.port, server_api=server_api)
        return MongoClient(
            uri,
            server_api=server_api,
            driver=DriverInfo(name="unstructured", version=unstructured_version),
        )

    def get_collection(self, client):
        """Return the configured collection from the configured database of ``client``."""
        return client[self.database].get_collection(name=self.collection)
73
+
74
+
75
@dataclass
class MongoDBDocumentMeta:
    """Identifying metadata kept for a single MongoDB document."""

    # Name of the collection the document belongs to.
    collection: str
    # String form of the document's ``_id``.
    document_id: str
    # Creation timestamp as a string.
    date_created: str
80
+
81
+
82
@dataclass
class MongoDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One MongoDB document as it moves through the ingest pipeline."""

    connector_config: SimpleMongoDBConfig
    document_meta: MongoDBDocumentMeta
    document: dict = field(default_factory=dict)
    registry_name: str = "mongodb"

    @property
    def filename(self):
        """Resolved path ``<download_dir>/<collection>/<document_id>.txt``."""
        collection_dir = Path(self.read_config.download_dir) / self.connector_config.collection
        return (collection_dir / f"{self.document_meta.document_id}.txt").resolve()

    @property
    def _output_filename(self):
        """Path ``<output_dir>/<collection>/<document_id>.json`` (not resolved)."""
        collection_dir = Path(self.processor_config.output_dir) / self.connector_config.collection
        return collection_dir / f"{self.document_meta.document_id}.json"

    def update_source_metadata(self, **kwargs):
        """Set ``source_metadata`` according to whether a document is attached."""
        if self.document is not None:
            self.source_metadata = SourceMetadata(
                date_created=self.document_meta.date_created,
                exists=True,
            )
        else:
            self.source_metadata = SourceMetadata(exists=False)

    @SourceConnectionError.wrap
    @requires_dependencies(["pymongo"], extras="mongodb")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Do nothing; ``MongoDBIngestDocBatch.get_files`` writes the local file."""
        pass

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Host, collection and document id that identify this record."""
        config = self.connector_config
        return {
            "host": config.host,
            "collection": config.collection,
            "document_id": self.document_meta.document_id,
        }
129
+
130
+
131
@dataclass
class MongoDBIngestDocBatch(BaseIngestDocBatch):
    """A batch of MongoDB documents, identified by id, fetched and written together."""

    connector_config: SimpleMongoDBConfig
    ingest_docs: t.List[MongoDBIngestDoc] = field(default_factory=list)
    list_of_ids: t.List[str] = field(default_factory=list)
    registry_name: str = "mongodb_batch"

    @property
    def unique_id(self) -> str:
        """Comma-joined, sorted document ids of this batch."""
        return ",".join(sorted(self.list_of_ids))

    @requires_dependencies(["pymongo"], extras="mongodb")
    def _get_docs(self) -> t.List[dict]:
        """Fetch the documents whose ids are listed in ``list_of_ids``.

        A dedicated client is created for the query and closed afterwards so
        that each batch does not leave an open client behind.
        """
        from bson.objectid import ObjectId

        # MongoDB expects a list of ObjectIds; convert before opening a client
        # so a malformed id fails without creating one.
        object_ids = [ObjectId(doc_id) for doc_id in self.list_of_ids]

        # Note for future. Maybe this could use other client
        client = self.connector_config.generate_client()
        try:
            collection = self.connector_config.get_collection(client)
            # Materialize the cursor while the client is still open.
            return list(collection.find({"_id": {"$in": object_ids}}))
        finally:
            client.close()

    def get_files(self):
        """Fetch the batch's documents and write each one to its local text file.

        Every fetched document becomes a ``MongoDBIngestDoc`` appended to
        ``ingest_docs``. The file content is the document's flattened values
        (without ``_id``), one per line.
        """
        for doc in self._get_docs():
            object_id = doc.get("_id")
            ingest_doc = MongoDBIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document_meta=MongoDBDocumentMeta(
                    collection=self.connector_config.collection,
                    document_id=str(object_id),
                    date_created=object_id.generation_time.isoformat(),
                ),
                document=doc,
            )
            ingest_doc.update_source_metadata()
            # NOTE(review): ``doc`` is presumably the same dict held by
            # ``ingest_doc.document``, so ``_id`` disappears from both - confirm.
            del doc["_id"]
            flattened = flatten_dict(dictionary=doc)
            text = "\n".join(str(value) for value in flattened.values())

            filename = ingest_doc.filename
            filename.parent.mkdir(parents=True, exist_ok=True)
            with open(filename, "w", encoding="utf8") as f:
                f.write(text)

            self.ingest_docs.append(ingest_doc)
182
+
183
+
184
@dataclass
class MongoDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that lists a MongoDB collection as batches of document ids."""

    connector_config: SimpleMongoDBConfig
    _client: t.Optional["MongoClient"] = field(init=False, default=None)

    @property
    def client(self) -> "MongoClient":
        """Client created on first access and reused afterwards."""
        if self._client is None:
            self._client = self.connector_config.generate_client()
        return self._client

    def check_connection(self):
        """Ping the server.

        Raises:
            SourceConnectionError: if the ping fails for any reason.
        """
        try:
            self.client.admin.command("ping")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # This is the source side, so report a source (not destination) error.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def _get_doc_ids(self) -> t.List[str]:
        """Return the string form of every distinct ``_id`` in the collection."""
        collection = self.connector_config.get_collection(self.client)
        return [str(x) for x in collection.distinct("_id")]

    def get_ingest_docs(self):
        """Return one ``MongoDBIngestDocBatch`` per ``batch_size`` slice of document ids."""
        ids = self._get_doc_ids()
        batch_size = self.connector_config.batch_size
        id_batches = [ids[start : start + batch_size] for start in range(0, len(ids), batch_size)]

        return [
            MongoDBIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=batched_ids,
            )
            for batched_ids in id_batches
        ]
235
+
236
+
237
@dataclass
class MongoDBDestinationConnector(BaseDestinationConnector):
    """Destination connector that inserts element dicts into a MongoDB collection."""

    connector_config: SimpleMongoDBConfig
    _client: t.Optional["MongoClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @property
    def client(self) -> "MongoClient":
        """Client created on first access and reused afterwards."""
        if self._client is None:
            self._client = self.connector_config.generate_client()
        return self._client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def check_connection(self):
        """Ping the server.

        Raises:
            DestinationConnectionError: if the ping fails for any reason.
        """
        try:
            self.client.admin.command("ping")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert ``elements_dict`` into the configured collection.

        An empty ``elements_dict`` is a no-op, because ``insert_many`` rejects
        an empty list.

        Raises:
            WriteError: if the insert fails.
        """
        if not elements_dict:
            logger.info("no documents to write to destination, skipping")
            return

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"database {self.connector_config.database}, "
            f"at collection {self.connector_config.collection}",
        )

        collection = self.connector_config.get_collection(self.client)
        try:
            collection.insert_many(elements_dict)
        except Exception as e:
            logger.error(f"failed to write records: {e}", exc_info=True)
            raise WriteError(f"failed to write records: {e}") from e
File without changes
@@ -0,0 +1,248 @@
1
+ from typing import Any, Generator, List, Optional, Tuple
2
+
3
+ import notion_client.errors
4
+ from notion_client import Client as NotionClient
5
+ from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
6
+ from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint
7
+ from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
8
+ from notion_client.api_endpoints import Endpoint
9
+ from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint
10
+ from notion_client.errors import RequestTimeoutError
11
+
12
+ from unstructured_ingest.connector.notion.types.block import Block
13
+ from unstructured_ingest.connector.notion.types.database import Database
14
+ from unstructured_ingest.connector.notion.types.database_properties import (
15
+ map_cells,
16
+ )
17
+ from unstructured_ingest.connector.notion.types.page import Page
18
+ from unstructured_ingest.ingest_backoff import RetryHandler
19
+ from unstructured_ingest.interfaces import RetryStrategyConfig
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
+
22
+
23
@requires_dependencies(["httpx"], extras="notion")
def _get_retry_strategy(
    endpoint: Endpoint, retry_strategy_config: RetryStrategyConfig
) -> RetryHandler:
    """Build an exponential-backoff ``RetryHandler`` for ``endpoint``.

    Retries are triggered by httpx timeouts, httpx status errors and Notion
    HTTP response errors. Time and try limits come from
    ``retry_strategy_config``; logging goes through the endpoint's parent
    logger at that logger's own level.
    """
    import backoff
    import httpx

    parent_logger = endpoint.parent.logger
    log_level = parent_logger.level
    return RetryHandler(
        backoff.expo,
        (
            httpx.TimeoutException,
            httpx.HTTPStatusError,
            notion_client.errors.HTTPResponseError,
        ),
        max_time=retry_strategy_config.max_retry_time,
        max_tries=retry_strategy_config.max_retries,
        logger=parent_logger,
        start_log_level=log_level,
        backoff_log_level=log_level,
    )
45
+
46
+
47
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Return a retry handler for ``endpoint``, or ``None`` when retries are not configured.

    An endpoint without a ``retry_strategy_config`` attribute, or with a falsy
    one, gets ``None`` instead of an ``AttributeError``.
    """
    retry_strategy_config = getattr(endpoint, "retry_strategy_config", None)
    if not retry_strategy_config:
        return None
    return _get_retry_strategy(endpoint=endpoint, retry_strategy_config=retry_strategy_config)
51
+
52
+
53
class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Block-children endpoint that returns ``Block`` objects and optionally retries."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler built from ``retry_strategy_config`` on each access, or ``None``."""
        return get_retry_handler(self)

    def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]:
        """Fetch one page of children.

        Returns the parsed child blocks and the rest of the response (with
        ``results`` removed).
        """
        resp: dict = (
            self.retry_handler(super().list, block_id=block_id, **kwargs)
            if self.retry_handler
            else super().list(block_id=block_id, **kwargs)
        )  # type: ignore
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    def iterate_list(
        self,
        block_id: str,
        **kwargs: Any,
    ) -> Generator[List[Block], None, None]:
        """Yield the children of ``block_id`` one page at a time until no pages remain."""
        while True:
            response: dict = (
                self.retry_handler(super().list, block_id=block_id, **kwargs)
                if self.retry_handler
                else super().list(block_id=block_id, **kwargs)
            )  # type: ignore
            child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])]
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
            # Advance to the next page; without this the same page is requested forever.
            kwargs["start_cursor"] = next_cursor
93
+
94
+
95
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint that returns typed objects and optionally retries requests."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler built from ``retry_strategy_config`` on each access, or ``None``."""
        return get_retry_handler(self)

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Fetch a database and parse it into a ``Database``."""
        resp: dict = (
            self.retry_handler(super().retrieve, database_id=database_id, **kwargs)
            if (self.retry_handler)
            else (super().retrieve(database_id=database_id, **kwargs))
        )  # type: ignore
        return Database.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a HEAD request for the database and return the HTTP status code.

        Raises:
            RequestTimeoutError: if the request times out.
        """
        import httpx

        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        try:
            response: httpx.Response = (
                self.retry_handler(self.parent.client.send, request)
                if (self.retry_handler)
                else (self.parent.client.send(request))
            )  # type: ignore
            return response.status_code
        except httpx.TimeoutException:
            raise RequestTimeoutError()

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get one page of [Pages](https://developers.notion.com/reference/page) in the database.

        Returns the parsed pages (properties passed through ``map_cells``) and
        the rest of the response with ``results`` removed.

        *[Endpoint documentation](https://developers.notion.com/reference/post-database-query)*
        """  # noqa: E501
        resp: dict = (
            self.retry_handler(super().query, database_id=database_id, **kwargs)
            if (self.retry_handler)
            else (super().query(database_id=database_id, **kwargs))
        )  # type: ignore
        pages = [Page.from_dict(data=p) for p in resp.pop("results")]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the database's pages one result page at a time until none remain."""
        while True:
            response: dict = (
                self.retry_handler(super().query, database_id=database_id, **kwargs)
                if (self.retry_handler)
                else (super().query(database_id=database_id, **kwargs))
            )  # type: ignore
            pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
            for p in pages:
                p.properties = map_cells(p.properties)
            yield pages

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
            # Advance to the next page; without this the same page is requested forever.
            kwargs["start_cursor"] = next_cursor
166
+
167
+
168
class BlocksEndpoint(NotionBlocksEndpoint):
    """Blocks endpoint that returns ``Block`` objects and optionally retries requests."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config
        # The children endpoint gets the same arguments and retry settings.
        self.children = BlocksChildrenEndpoint(
            *args,
            retry_strategy_config=retry_strategy_config,
            **kwargs,
        )

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler built from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def retrieve(self, block_id: str, **kwargs: Any) -> Block:
        """Fetch a block, through the retry handler when one is configured."""
        handler = self.retry_handler
        if handler:
            resp: dict = handler(super().retrieve, block_id=block_id, **kwargs)  # type: ignore
        else:
            resp = super().retrieve(block_id=block_id, **kwargs)  # type: ignore
        return Block.from_dict(data=resp)
194
+
195
+
196
class PagesEndpoint(NotionPagesEndpoint):
    """Pages endpoint that returns ``Page`` objects and optionally retries requests."""

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """Retry handler built from ``retry_strategy_config``, or ``None``."""
        return get_retry_handler(self)

    def retrieve(self, page_id: str, **kwargs: Any) -> Page:
        """Fetch a page, through the retry handler when one is configured."""
        handler = self.retry_handler
        if handler:
            resp: dict = handler(super().retrieve, page_id=page_id, **kwargs)  # type: ignore
        else:
            resp = super().retrieve(page_id=page_id, **kwargs)  # type: ignore
        return Page.from_dict(data=resp)

    @requires_dependencies(["httpx"], extras="notion")
    def retrieve_status(self, page_id: str, **kwargs) -> int:
        """Send a HEAD request for the page and return the HTTP status code.

        An httpx timeout is re-raised as ``RequestTimeoutError``.
        """
        import httpx

        head_request = self.parent._build_request(
            method="HEAD",
            path=f"pages/{page_id}",
            auth=kwargs.get("auth"),
        )
        try:
            handler = self.retry_handler
            if handler:
                response: httpx.Response = handler(self.parent.client.send, head_request)
            else:
                response = self.parent.client.send(head_request)
            return response.status_code
        except httpx.TimeoutException:
            raise RequestTimeoutError()
236
+
237
+
238
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are the retry-aware ones."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        # Each replacement endpoint gets the same retry settings and this client as parent.
        endpoint_kwargs = {"retry_strategy_config": retry_strategy_config, "parent": self}
        self.blocks = BlocksEndpoint(**endpoint_kwargs)
        self.pages = PagesEndpoint(**endpoint_kwargs)
        self.databases = DatabasesEndpoint(**endpoint_kwargs)