unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,285 @@
1
+ import copy
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+ from unstructured.__version__ import __version__ as unstructured_version
7
+
8
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
10
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError, WriteError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ BaseConnectorConfig,
14
+ BaseDestinationConnector,
15
+ BaseIngestDocBatch,
16
+ BaseSingleIngestDoc,
17
+ BaseSourceConnector,
18
+ IngestDocCleanupMixin,
19
+ SourceConnectorCleanupMixin,
20
+ SourceMetadata,
21
+ )
22
+ from unstructured_ingest.logger import logger
23
+ from unstructured_ingest.utils.data_prep import flatten_dict
24
+ from unstructured_ingest.utils.dep_check import requires_dependencies
25
+
26
+ if t.TYPE_CHECKING:
27
+ from pymongo import MongoClient
28
+
29
+
30
# Version string handed to pymongo's ``ServerApi`` whenever a client is built.
SERVER_API_VERSION = "1"
31
+
32
+
33
def parse_userinfo(userinfo: str) -> t.Tuple[str, str]:
    """Split a ``user:password`` string at its first colon.

    Args:
        userinfo: The combined credential string.

    Returns:
        A ``(user, password)`` tuple. The password is an empty string when
        ``userinfo`` contains no colon; any further colons stay in the password.
    """
    username, _separator, password = userinfo.partition(":")
    return username, password
36
+
37
+
38
@dataclass
class MongoDBAccessConfig(AccessConfig):
    """Access settings for a MongoDB deployment."""

    # Connection URI; flagged ``sensitive`` on the enhanced field.
    uri: t.Optional[str] = enhanced_field(default=None, sensitive=True)
41
+
42
+
43
@dataclass
class SimpleMongoDBConfig(BaseConnectorConfig):
    """Connection settings used by the MongoDB source and destination connectors."""

    access_config: MongoDBAccessConfig
    host: t.Optional[str] = None
    database: t.Optional[str] = None
    collection: t.Optional[str] = None
    port: int = 27017
    batch_size: int = 100

    @requires_dependencies(["pymongo"], extras="mongodb")
    def generate_client(self) -> "MongoClient":
        """Create a new ``MongoClient`` from this configuration.

        The URI in ``access_config`` takes precedence when it is set; otherwise
        the client is built from ``host`` and ``port``. Both paths pass the same
        server API version and driver info, so the client identifies itself the
        same way regardless of how the connection was configured.

        Returns:
            A freshly constructed ``MongoClient``. The caller owns it.
        """
        from pymongo import MongoClient
        from pymongo.driver_info import DriverInfo
        from pymongo.server_api import ServerApi

        client_kwargs = {
            "server_api": ServerApi(version=SERVER_API_VERSION),
            "driver": DriverInfo(name="unstructured", version=unstructured_version),
        }
        if self.access_config.uri:
            return MongoClient(self.access_config.uri, **client_kwargs)
        return MongoClient(host=self.host, port=self.port, **client_kwargs)

    def get_collection(self, client):
        """Return the configured collection of the configured database.

        Args:
            client: A client object indexable by database name.
        """
        database = client[self.database]
        return database.get_collection(name=self.collection)
74
+
75
+
76
@dataclass
class MongoDBDocumentMeta:
    """Identifying metadata kept for a single MongoDB document."""

    collection: str  # collection name
    document_id: str  # document identifier, as a string
    date_created: str  # creation timestamp, as a string
81
+
82
+
83
@dataclass
class MongoDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One MongoDB document, addressed by collection and document id."""

    connector_config: SimpleMongoDBConfig
    document_meta: MongoDBDocumentMeta
    document: dict = field(default_factory=dict)
    registry_name: str = "mongodb"

    @property
    def filename(self):
        """Resolved path ``<download_dir>/<collection>/<document_id>.txt``."""
        collection_dir = Path(self.read_config.download_dir) / self.connector_config.collection
        return (collection_dir / f"{self.document_meta.document_id}.txt").resolve()

    @property
    def _output_filename(self):
        """Path ``<output_dir>/<collection>/<document_id>.json`` (not resolved)."""
        collection_dir = Path(self.processor_config.output_dir) / self.connector_config.collection
        return collection_dir / f"{self.document_meta.document_id}.json"

    def update_source_metadata(self, **kwargs):
        """Set ``source_metadata`` according to whether a document is held."""
        if self.document is not None:
            self.source_metadata = SourceMetadata(
                date_created=self.document_meta.date_created,
                exists=True,
            )
        else:
            self.source_metadata = SourceMetadata(
                exists=False,
            )

    @SourceConnectionError.wrap
    @requires_dependencies(["pymongo"], extras="mongodb")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Do nothing; the method body is intentionally empty."""

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Host, collection and document id that locate this record."""
        config = self.connector_config
        return {
            "host": config.host,
            "collection": config.collection,
            "document_id": self.document_meta.document_id,
        }
130
+
131
+
132
@dataclass
class MongoDBIngestDocBatch(BaseIngestDocBatch):
    """A batch of MongoDB documents fetched together by their ids."""

    connector_config: SimpleMongoDBConfig
    ingest_docs: t.List[MongoDBIngestDoc] = field(default_factory=list)
    list_of_ids: t.List[str] = field(default_factory=list)
    registry_name: str = "mongodb_batch"

    @property
    def unique_id(self) -> str:
        """Comma-joined, sorted document ids of this batch."""
        return ",".join(sorted(self.list_of_ids))

    @requires_dependencies(["pymongo"], extras="mongodb")
    def _get_docs(self) -> t.List[dict]:
        """Fetch the documents whose ids are listed in ``list_of_ids``.

        A dedicated client is created for the query and closed afterwards, so
        repeated batches do not accumulate open connections.

        Returns:
            The matching documents, fully materialized as a list.
        """
        from bson.objectid import ObjectId

        # MongoDB expects a list of ObjectIds
        object_ids = [ObjectId(doc_id) for doc_id in self.list_of_ids]

        # Note for future. Maybe this could use other client
        client = self.connector_config.generate_client()
        try:
            collection = self.connector_config.get_collection(client)
            # list() drains the cursor before the client is closed below.
            return list(collection.find({"_id": {"$in": object_ids}}))
        finally:
            client.close()

    def get_files(self):
        """Fetch this batch's documents and write each one to a local text file.

        For every document a ``MongoDBIngestDoc`` is created and appended to
        ``ingest_docs``. The file content is the newline-joined string form of
        the flattened document values, excluding ``_id``.
        """
        for doc in self._get_docs():
            object_id = doc["_id"]
            ingest_doc = MongoDBIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document_meta=MongoDBDocumentMeta(
                    collection=self.connector_config.collection,
                    document_id=str(object_id),
                    date_created=object_id.generation_time.isoformat(),
                ),
                document=doc,
            )
            ingest_doc.update_source_metadata()
            # The id is already captured in document_meta; drop it from the content.
            del doc["_id"]
            flattened_dict = flatten_dict(dictionary=doc)
            concatenated_values = "\n".join(str(value) for value in flattened_dict.values())

            filename = ingest_doc.filename
            filename.parent.mkdir(parents=True, exist_ok=True)
            with open(filename, "w", encoding="utf8") as f:
                f.write(concatenated_values)

            self.ingest_docs.append(ingest_doc)
183
+
184
+
185
@dataclass
class MongoDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector that reads documents from a MongoDB collection in batches."""

    connector_config: SimpleMongoDBConfig
    _client: t.Optional["MongoClient"] = field(init=False, default=None)

    @property
    def client(self) -> "MongoClient":
        """Client created on first access and reused afterwards."""
        if self._client is None:
            self._client = self.connector_config.generate_client()
        return self._client

    def check_connection(self):
        """Ping the server to validate the connection.

        Raises:
            SourceConnectionError: If the ping fails for any reason.
        """
        try:
            self.client.admin.command("ping")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # This is the source side, so report a source connection failure.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def _get_doc_ids(self) -> t.List[str]:
        """Fetches all document ids in a collection."""
        collection = self.connector_config.get_collection(self.client)
        return [str(x) for x in collection.distinct("_id")]

    def get_ingest_docs(self):
        """Return one ``MongoDBIngestDocBatch`` per ``batch_size`` ids in the collection.

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        batch_size = self.connector_config.batch_size
        if batch_size <= 0:
            # A non-positive size would divide by zero or silently yield no batches.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        ids = self._get_doc_ids()
        id_batches = [ids[start : start + batch_size] for start in range(0, len(ids), batch_size)]

        return [
            MongoDBIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=batched_ids,
            )
            for batched_ids in id_batches
        ]
236
+
237
+
238
@dataclass
class MongoDBDestinationConnector(BaseDestinationConnector):
    """Destination connector that inserts element dictionaries into a MongoDB collection."""

    connector_config: SimpleMongoDBConfig
    _client: t.Optional["MongoClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        The _client variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        # Shallow copy so the live client on this instance is left untouched.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            self_cp._client = None
        return _asdict(self_cp, **kwargs)

    @property
    def client(self) -> "MongoClient":
        """Client created on first access and reused afterwards."""
        if self._client is None:
            self._client = self.connector_config.generate_client()
        return self._client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def check_connection(self):
        """Ping the server to validate the connection.

        Raises:
            DestinationConnectionError: If the ping fails for any reason.
        """
        try:
            self.client.admin.command("ping")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Create the client eagerly."""
        _ = self.client

    @requires_dependencies(["pymongo"], extras="mongodb")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Insert ``elements_dict`` into the configured collection.

        Args:
            elements_dict: The records to insert, one dictionary per document.

        Raises:
            WriteError: If the insert fails for any reason.
        """
        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"database {self.connector_config.database}, "
            f"at collection {self.connector_config.collection}",
        )

        collection = self.connector_config.get_collection(self.client)
        try:
            collection.insert_many(elements_dict)
        except Exception as e:
            logger.error(f"failed to write records: {e}", exc_info=True)
            raise WriteError(f"failed to write records: {e}") from e
File without changes
@@ -0,0 +1,233 @@
1
+ from typing import Any, Generator, List, Optional, Tuple
2
+
3
+ import backoff
4
+ import httpx
5
+ import notion_client.errors
6
+ from notion_client import Client as NotionClient
7
+ from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
8
+ from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint
9
+ from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
10
+ from notion_client.api_endpoints import Endpoint
11
+ from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint
12
+ from notion_client.errors import RequestTimeoutError
13
+
14
+ from unstructured_ingest.connector.notion.types.block import Block
15
+ from unstructured_ingest.connector.notion.types.database import Database
16
+ from unstructured_ingest.connector.notion.types.database_properties import (
17
+ map_cells,
18
+ )
19
+ from unstructured_ingest.connector.notion.types.page import Page
20
+ from unstructured_ingest.ingest_backoff import RetryHandler
21
+ from unstructured_ingest.interfaces import RetryStrategyConfig
22
+
23
# Exception types handed to RetryHandler by get_retry_handler(); calls that raise
# one of these are candidates for a retry with exponential backoff.
retryable_exceptions = (
    httpx.TimeoutException,
    httpx.HTTPStatusError,
    notion_client.errors.HTTPResponseError,
)
28
+
29
+
30
def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]:
    """Build a RetryHandler from the endpoint's retry configuration, if it has one.

    Args:
        endpoint: endpoint whose ``retry_strategy_config`` attribute (when present
            and truthy) supplies ``max_retry_time`` and ``max_retries``; the parent
            client's logger and its level are used for retry logging.

    Returns:
        A RetryHandler using exponential backoff over ``retryable_exceptions``, or
        ``None`` when the endpoint has no retry configuration.
    """
    # Default to None so endpoints that never set the attribute mean "no retries"
    # rather than raising AttributeError.
    retry_strategy_config = getattr(endpoint, "retry_strategy_config", None)
    if not retry_strategy_config:
        return None

    parent_logger = endpoint.parent.logger
    return RetryHandler(
        backoff.expo,
        retryable_exceptions,
        max_time=retry_strategy_config.max_retry_time,
        max_tries=retry_strategy_config.max_retries,
        logger=parent_logger,
        start_log_level=parent_logger.level,
        backoff_log_level=parent_logger.level,
    )
42
+
43
+
44
class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
    """Block-children endpoint that parses results into ``Block`` objects.

    Calls are routed through a RetryHandler when ``retry_strategy_config`` is set.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A RetryHandler for this endpoint, or None when retries are not configured."""
        return get_retry_handler(self)

    def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]:
        """Fetch one page of children of ``block_id``.

        Returns:
            The parsed child blocks and the remaining response fields (with
            ``results`` removed).
        """
        # Read the property once: each access builds a new handler.
        handler = self.retry_handler
        resp: dict = (
            handler(super().list, block_id=block_id, **kwargs)
            if handler
            else super().list(block_id=block_id, **kwargs)
        )  # type: ignore
        child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
        return child_blocks, resp

    def iterate_list(
        self,
        block_id: str,
        **kwargs: Any,
    ) -> Generator[List[Block], None, None]:
        """Yield the children of ``block_id`` one page at a time.

        Iteration stops when the response reports no more pages or carries no
        ``next_cursor``.
        """
        # Copy so the cursor we add never leaks into the caller's kwargs.
        request_kwargs = dict(kwargs)
        while True:
            handler = self.retry_handler
            response: dict = (
                handler(super().list, block_id=block_id, **request_kwargs)
                if handler
                else super().list(block_id=block_id, **request_kwargs)
            )  # type: ignore
            child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])]
            yield child_blocks

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
            # Advance to the next page; without this the same page is fetched forever.
            request_kwargs["start_cursor"] = next_cursor
84
+
85
+
86
class DatabasesEndpoint(NotionDatabasesEndpoint):
    """Databases endpoint that returns parsed ``Database``/``Page`` objects.

    Calls are routed through a RetryHandler when ``retry_strategy_config`` is set.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A RetryHandler for this endpoint, or None when retries are not configured."""
        return get_retry_handler(self)

    def retrieve(self, database_id: str, **kwargs: Any) -> Database:
        """Retrieve ``database_id`` and parse the response into a ``Database``."""
        # Read the property once: each access builds a new handler.
        handler = self.retry_handler
        resp: dict = (
            handler(super().retrieve, database_id=database_id, **kwargs)
            if handler
            else super().retrieve(database_id=database_id, **kwargs)
        )  # type: ignore
        return Database.from_dict(data=resp)

    def retrieve_status(self, database_id: str, **kwargs) -> int:
        """Send a HEAD request for ``database_id`` and return the HTTP status code.

        Raises:
            RequestTimeoutError: if the request times out.
        """
        request = self.parent._build_request(
            method="HEAD",
            path=f"databases/{database_id}",
            auth=kwargs.get("auth"),
        )
        handler = self.retry_handler
        try:
            response: httpx.Response = (
                handler(self.parent.client.send, request)
                if handler
                else self.parent.client.send(request)
            )  # type: ignore
            return response.status_code
        except httpx.TimeoutException as exc:
            raise RequestTimeoutError() from exc

    def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]:
        """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database.

        *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)*
        """  # noqa: E501
        handler = self.retry_handler
        resp: dict = (
            handler(super().query, database_id=database_id, **kwargs)
            if handler
            else super().query(database_id=database_id, **kwargs)
        )  # type: ignore
        pages = [Page.from_dict(data=p) for p in resp.pop("results")]
        for p in pages:
            p.properties = map_cells(p.properties)
        return pages, resp

    def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]:
        """Yield the pages of ``database_id`` one result page at a time.

        Iteration stops when the response reports no more pages or carries no
        ``next_cursor``.
        """
        # Copy so the cursor we add never leaks into the caller's kwargs.
        request_kwargs = dict(kwargs)
        while True:
            handler = self.retry_handler
            response: dict = (
                handler(super().query, database_id=database_id, **request_kwargs)
                if handler
                else super().query(database_id=database_id, **request_kwargs)
            )  # type: ignore
            pages = [Page.from_dict(data=p) for p in response.pop("results", [])]
            for p in pages:
                p.properties = map_cells(p.properties)
            yield pages

            next_cursor = response.get("next_cursor")
            if not response.get("has_more") or not next_cursor:
                return
            # Advance to the next page; without this the same page is fetched forever.
            request_kwargs["start_cursor"] = next_cursor
154
+
155
+
156
class BlocksEndpoint(NotionBlocksEndpoint):
    """Blocks endpoint that returns parsed ``Block`` objects.

    Its ``children`` sub-endpoint is replaced by the retry-aware
    ``BlocksChildrenEndpoint`` built with the same arguments.
    """

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config
        # Positional unpacking goes before keyword arguments so the call reads in
        # the order the arguments are actually bound.
        self.children = BlocksChildrenEndpoint(
            *args,
            retry_strategy_config=retry_strategy_config,
            **kwargs,
        )

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A RetryHandler for this endpoint, or None when retries are not configured."""
        return get_retry_handler(self)

    def retrieve(self, block_id: str, **kwargs: Any) -> Block:
        """Retrieve ``block_id`` and parse the response into a ``Block``."""
        # Read the property once: each access builds a new handler.
        handler = self.retry_handler
        resp: dict = (
            handler(super().retrieve, block_id=block_id, **kwargs)
            if handler
            else super().retrieve(block_id=block_id, **kwargs)
        )  # type: ignore
        return Block.from_dict(data=resp)
182
+
183
+
184
class PagesEndpoint(NotionPagesEndpoint):
    """Pages endpoint that returns parsed ``Page`` objects.

    Calls are routed through a RetryHandler when ``retry_strategy_config`` is set.
    """

    def __init__(
        self,
        *args,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.retry_strategy_config = retry_strategy_config

    @property
    def retry_handler(self) -> Optional[RetryHandler]:
        """A RetryHandler for this endpoint, or None when retries are not configured."""
        return get_retry_handler(self)

    def retrieve(self, page_id: str, **kwargs: Any) -> Page:
        """Retrieve ``page_id`` and parse the response into a ``Page``."""
        # Read the property once: each access builds a new handler.
        handler = self.retry_handler
        resp: dict = (
            handler(super().retrieve, page_id=page_id, **kwargs)
            if handler
            else super().retrieve(page_id=page_id, **kwargs)
        )  # type: ignore
        return Page.from_dict(data=resp)

    def retrieve_status(self, page_id: str, **kwargs) -> int:
        """Send a HEAD request for ``page_id`` and return the HTTP status code.

        Raises:
            RequestTimeoutError: if the request times out.
        """
        request = self.parent._build_request(
            method="HEAD",
            path=f"pages/{page_id}",
            auth=kwargs.get("auth"),
        )
        handler = self.retry_handler
        try:
            response: httpx.Response = (
                handler(self.parent.client.send, request)
                if handler
                else self.parent.client.send(request)
            )  # type: ignore
            return response.status_code
        except httpx.TimeoutException as exc:
            raise RequestTimeoutError() from exc
221
+
222
+
223
class Client(NotionClient):
    """Notion client whose blocks, pages and databases endpoints are the retry-aware variants."""

    def __init__(
        self,
        *args: Any,
        retry_strategy_config: Optional[RetryStrategyConfig] = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        # Every endpoint receives the same retry configuration and this client as parent.
        endpoint_options = {"retry_strategy_config": retry_strategy_config, "parent": self}
        self.blocks = BlocksEndpoint(**endpoint_options)
        self.pages = PagesEndpoint(**endpoint_options)
        self.databases = DatabasesEndpoint(**endpoint_options)