unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,335 @@
1
+ import io
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
6
+
7
+ from dateutil import parser
8
+ from unstructured.documents.elements import DataSourceMetadata
9
+ from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
10
+
11
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
12
+ from unstructured_ingest.error import SourceConnectionNetworkError
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.utils.string_and_date_utils import json_to_dict
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ Downloader,
19
+ DownloaderConfig,
20
+ FileData,
21
+ Indexer,
22
+ IndexerConfig,
23
+ SourceIdentifiers,
24
+ download_responses,
25
+ )
26
+ from unstructured_ingest.v2.logger import logger
27
+ from unstructured_ingest.v2.processes.connector_registry import (
28
+ SourceRegistryEntry,
29
+ )
30
+
31
# Connector identifier stamped onto the FileData records and the downloader defined below.
CONNECTOR_TYPE = "google_drive"
32
+
33
+ if TYPE_CHECKING:
34
+ from googleapiclient.discovery import Resource as GoogleAPIResource
35
+ from googleapiclient.http import MediaIoBaseDownload
36
+
37
+
38
@dataclass
class GoogleDriveAccessConfig(AccessConfig):
    """Credentials used to authenticate against the Google Drive API."""

    # Service-account key, supplied either as an already-parsed dict or as a
    # string (string values are run through ``json_to_dict`` before use).
    service_account_key: Union[str, dict]
41
+
42
+
43
@dataclass
class GoogleDriveConnectionConfig(ConnectionConfig):
    """Connection settings for a Google Drive source.

    Holds the id of the Drive object to read from together with the
    credentials needed to build an authenticated Drive v3 ``files`` client.
    """

    # Id of the Drive object handed to the indexer as its starting point.
    drive_id: str
    access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def get_files_service(self) -> "GoogleAPIResource":
        """Build a Drive v3 service and return its ``files()`` resource.

        Raises:
            TypeError: if ``service_account_key`` is neither ``str`` nor ``dict``.
            ValueError: raised in place of a Google ``HttpError`` or
                ``DefaultCredentialsError`` (the original error is chained as
                ``__cause__``), or when the parsed key is neither a dict nor a str.
        """
        from google.auth import default, exceptions
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.errors import HttpError

        raw_key = self.access_config.service_account_key
        # Service account key can be a dict or a file path (str),
        # but the dict may come in as a string.
        # NOTE(review): presumably json_to_dict returns a dict for JSON text and
        # hands any other string back unchanged -- confirm against the helper.
        if isinstance(raw_key, str):
            key_path = json_to_dict(raw_key)
        elif isinstance(raw_key, dict):
            key_path = raw_key
        else:
            raise TypeError(
                f"access_config.service_account_key must be "
                f"str or dict, got: {type(raw_key)}"
            )

        try:
            if isinstance(key_path, dict):
                creds = service_account.Credentials.from_service_account_info(key_path)
            elif isinstance(key_path, str):
                # A plain string is treated as a key-file path and resolved through
                # google.auth.default(); note this mutates the process environment.
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
                creds, _ = default()
            else:
                raise ValueError(
                    f"key path not recognized as a dictionary or a file path: "
                    f"[{type(key_path)}] {key_path}",
                )
            service = build("drive", "v3", credentials=creds)
            return service.files()

        # Chain the underlying error so the root cause stays visible in tracebacks.
        except HttpError as exc:
            raise ValueError(f"{exc.reason}") from exc
        except exceptions.DefaultCredentialsError as exc:
            raise ValueError("The provided API key is invalid.") from exc
85
+
86
+
87
@dataclass
class GoogleDriveIndexerConfig(IndexerConfig):
    """Options controlling which Drive files are indexed."""

    # Optional whitelist of file extensions; stored without a leading period.
    extensions: Optional[list[str]] = None
    # When True, sub-folders are descended into while listing.
    recursive: bool = False

    def __post_init__(self):
        """Normalise ``extensions`` by dropping one leading period from each entry."""
        if self.extensions is None:
            return
        self.extensions = [ext.removeprefix(".") for ext in self.extensions]
96
+
97
+
98
@dataclass
class GoogleDriveIndexer(Indexer):
    """Lists the files under a Google Drive object and emits them as ``FileData``."""

    connection_config: GoogleDriveConnectionConfig
    index_config: GoogleDriveIndexerConfig
    # File attributes requested from the Drive API for every record.
    fields: list[str] = field(
        default_factory=lambda: [
            "id",
            "name",
            "mimeType",
            "fileExtension",
            "md5Checksum",
            "sha1Checksum",
            "sha256Checksum",
            "headRevisionId",
            "permissions",
            "createdTime",
            "modifiedTime",
            "version",
            "originalFilename",
            "capabilities",
            "permissionIds",
            "webViewLink",
            "webContentLink",
        ]
    )

    @staticmethod
    def is_dir(record: dict) -> bool:
        """Return True when the record's ``mimeType`` is the Drive folder type."""
        return record.get("mimeType") == "application/vnd.google-apps.folder"

    @staticmethod
    def map_file_data(f: dict) -> FileData:
        """Convert one Drive file record into a ``FileData``.

        The record is consumed in place: keys promoted to first-class metadata
        are popped, and whatever remains (including ``id``) is attached as
        ``additional_metadata``.
        """
        file_id = f["id"]
        filename = f.pop("name")
        url = f.pop("webContentLink", None)
        version = f.pop("version", None)
        permissions = f.pop("permissions", None)
        date_created_str = f.pop("createdTime", None)
        date_modified_str = f.pop("modifiedTime", None)
        parent_path = f.pop("parent_path", None)
        parent_root_path = f.pop("parent_root_path", None)

        # Either timestamp may be absent from the record; emit None instead of
        # calling .timestamp() on a missing value.
        date_created = (
            str(parser.parse(date_created_str).timestamp()) if date_created_str else None
        )
        date_modified = (
            str(parser.parse(date_modified_str).timestamp()) if date_modified_str else None
        )

        if (
            parent_path
            and isinstance(parent_path, str)
            and parent_root_path
            and isinstance(parent_root_path, str)
        ):
            fullpath = f"{parent_path}/{filename}"
            # Strip only the leading root prefix; str.replace would also delete
            # later path segments that happen to contain the root name.
            rel_path = fullpath.removeprefix(parent_root_path)
            source_identifiers = SourceIdentifiers(
                filename=filename, fullpath=fullpath, rel_path=rel_path
            )
        else:
            source_identifiers = SourceIdentifiers(fullpath=filename, filename=filename)

        return FileData(
            connector_type=CONNECTOR_TYPE,
            identifier=file_id,
            source_identifiers=source_identifiers,
            metadata=DataSourceMetadata(
                url=url,
                version=version,
                date_created=date_created,
                date_modified=date_modified,
                permissions_data=permissions,
                record_locator={
                    "file_id": file_id,
                },
            ),
            additional_metadata=f,
        )

    def get_paginated_results(
        self,
        files_client,
        object_id: str,
        extensions: Optional[list[str]] = None,
        recursive: bool = False,
        previous_path: Optional[str] = None,
    ) -> list[dict]:
        """List every non-folder record whose parent is ``object_id``.

        Follows ``nextPageToken`` until the listing is exhausted. When
        ``recursive`` is set, each folder found is listed in turn with its name
        appended to ``previous_path``. Every returned record carries
        ``parent_path`` (the folder path it was found under) and
        ``parent_root_path`` (the ``previous_path`` of the outermost call).
        """
        fields_input = "nextPageToken, files({})".format(",".join(self.fields))
        q = f"'{object_id}' in parents"
        # Filter by extension but still include any directories
        if extensions:
            ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
            q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
        logger.debug(f"Query used when indexing: {q}")
        logger.debug("response fields limited to: {}".format(", ".join(self.fields)))

        files_response = []
        page_token = None
        while True:
            response: dict = files_client.list(
                spaces="drive",
                fields=fields_input,
                corpora="user",
                pageToken=page_token,
                q=q,
            ).execute()
            if files := response.get("files", []):
                fs = [f for f in files if not self.is_dir(record=f)]
                for r in fs:
                    r["parent_path"] = previous_path
                dirs = [f for f in files if self.is_dir(record=f)]
                files_response.extend(fs)
                if recursive:
                    for d in dirs:
                        files_response.extend(
                            self.get_paginated_results(
                                files_client=files_client,
                                object_id=d["id"],
                                extensions=extensions,
                                recursive=recursive,
                                previous_path=f"{previous_path}/{d['name']}",
                            )
                        )
            page_token = response.get("nextPageToken")
            if page_token is None:
                break

        # Records returned by nested calls get their root overwritten here, so
        # the value set by the outermost call is the one that survives.
        for r in files_response:
            r["parent_root_path"] = previous_path
        return files_response

    def get_root_info(self, files_client, object_id: str) -> dict:
        """Fetch the record for ``object_id`` itself, limited to ``self.fields``."""
        return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute()

    def get_files(
        self,
        files_client,
        object_id: str,
        recursive: bool = False,
        extensions: Optional[list[str]] = None,
    ) -> list[FileData]:
        """Return ``FileData`` for ``object_id``: itself if a file, its contents if a folder."""
        root_info = self.get_root_info(files_client=files_client, object_id=object_id)
        if not self.is_dir(root_info):
            data = [self.map_file_data(root_info)]
        else:
            file_contents = self.get_paginated_results(
                files_client=files_client,
                object_id=object_id,
                extensions=extensions,
                recursive=recursive,
                previous_path=root_info["name"],
            )
            data = [self.map_file_data(f=f) for f in file_contents]
        for d in data:
            # Was written as an annotation (`...["drive_id"]: object_id`), which
            # stores nothing; an assignment is what records the drive id.
            d.metadata.record_locator["drive_id"] = object_id
        return data

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every file found under the configured drive id."""
        yield from self.get_files(
            files_client=self.connection_config.get_files_service(),
            object_id=self.connection_config.drive_id,
            recursive=self.index_config.recursive,
            extensions=self.index_config.extensions,
        )
260
+
261
+
262
@dataclass
class GoogleDriveDownloaderConfig(DownloaderConfig):
    """Downloader options for Google Drive; adds nothing beyond ``DownloaderConfig``."""
265
+
266
+
267
@dataclass
class GoogleDriveDownloader(Downloader):
    """Downloads one Drive file per ``FileData`` and writes it under ``download_dir``."""

    connection_config: GoogleDriveConnectionConfig
    download_config: GoogleDriveDownloaderConfig = field(
        default_factory=GoogleDriveDownloaderConfig
    )
    connector_type: str = CONNECTOR_TYPE

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local target path: ``download_dir`` joined with the relative path."""
        rel_path = file_data.source_identifiers.relative_path
        # A leading slash would make Path() treat it as absolute and discard download_dir.
        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
        return self.download_dir / Path(rel_path)

    @SourceConnectionNetworkError.wrap
    def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
        """Pull chunks from ``downloader`` until it reports completion."""
        downloaded = False
        while downloaded is False:
            _, downloaded = downloader.next_chunk()
        return downloaded

    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
        """Write the buffered bytes to the download path and build the download response."""
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
        with open(download_path, "wb") as handler:
            handler.write(file_contents.getbuffer())
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    @requires_dependencies(["googleapiclient"], extras="google-drive")
    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download the file described by ``file_data``.

        Records whose mimeType starts with ``application/vnd.google-apps`` are
        exported to the type mapped in ``GOOGLE_DRIVE_EXPORT_TYPES``; all other
        files are fetched as-is.

        Raises:
            TypeError: if a Google-native file has no entry in the export mapping.
        """
        from googleapiclient.http import MediaIoBaseDownload

        logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
        mime_type = file_data.additional_metadata["mimeType"]
        record_id = file_data.identifier
        files_client = self.connection_config.get_files_service()
        if mime_type.startswith("application/vnd.google-apps"):
            # Look the export type up from the record's own mimeType; the
            # previous `self.meta` lookup referenced an attribute that is not
            # defined anywhere in this class.
            export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(mime_type)
            if not export_mime:
                raise TypeError(
                    f"File not supported. Name: {file_data.source_identifiers.filename} "
                    f"ID: {record_id} "
                    f"MimeType: {mime_type}"
                )

            request = files_client.export_media(
                fileId=record_id,
                mimeType=export_mime,
            )
        else:
            request = files_client.get_media(fileId=record_id)

        file_contents = io.BytesIO()
        downloader = MediaIoBaseDownload(file_contents, request)
        downloaded = self._get_content(downloader=downloader)
        # A BytesIO object is always truthy, so the former `not file_contents`
        # test could never fire; only the completion flag is meaningful.
        if not downloaded:
            return []
        return self._write_file(file_data=file_data, file_contents=file_contents)
327
+
328
+
329
# Registry entry bundling the Google Drive source classes defined in this module.
google_drive_source_entry = SourceRegistryEntry(
    indexer=GoogleDriveIndexer,
    indexer_config=GoogleDriveIndexerConfig,
    downloader=GoogleDriveDownloader,
    downloader_config=GoogleDriveDownloaderConfig,
    connection_config=GoogleDriveConnectionConfig,
)
@@ -0,0 +1,204 @@
1
+ import glob
2
+ import itertools
3
+ import shutil
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import Any, Generator, Optional
8
+
9
+ from unstructured.documents.elements import DataSourceMetadata
10
+
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ Downloader,
15
+ DownloaderConfig,
16
+ DownloadResponse,
17
+ FileData,
18
+ Indexer,
19
+ IndexerConfig,
20
+ SourceIdentifiers,
21
+ UploadContent,
22
+ Uploader,
23
+ UploaderConfig,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import (
27
+ DestinationRegistryEntry,
28
+ SourceRegistryEntry,
29
+ )
30
+
31
+ CONNECTOR_TYPE = "local"
32
+
33
+
34
@dataclass
class LocalAccessConfig(AccessConfig):
    """Access configuration for the local connector; it declares no fields of its own."""
37
+
38
+
39
@dataclass
class LocalConnectionConfig(ConnectionConfig):
    """Connection configuration for the local connector."""

    # A fresh LocalAccessConfig is created for every instance.
    access_config: LocalAccessConfig = field(default_factory=LocalAccessConfig)
42
+
43
+
44
@dataclass
class LocalIndexerConfig(IndexerConfig):
    """Settings that select which local files get indexed."""

    # File or directory to index, as given by the caller.
    input_path: str
    # When True the indexer walks sub-directories as well.
    recursive: bool = False
    # Optional glob patterns restricting which entries are picked up.
    file_glob: Optional[list[str]] = None

    @property
    def path(self) -> Path:
        """``input_path`` converted to a resolved ``Path``."""
        raw_path = Path(self.input_path)
        return raw_path.resolve()
53
+
54
+
55
@dataclass
class LocalIndexer(Indexer):
    """Enumerate files on the local filesystem and describe each one as ``FileData``."""

    index_config: LocalIndexerConfig
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )
    connector_type: str = CONNECTOR_TYPE

    def list_files(self) -> list[Path]:
        """Return the files selected by the index configuration.

        When the configured path is a single file, that file is returned as
        is. Otherwise the directory is globbed (recursively when ``recursive``
        is set) with every pattern in ``file_glob``, or ``"*"`` when no
        pattern is configured.

        Returns:
            Matching regular files, without duplicates, in discovery order.
        """
        input_path = self.index_config.path
        if input_path.is_file():
            # Return the path directly: passing it through glob would interpret
            # characters such as "[" in the file name as pattern syntax.
            return [input_path]
        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
        patterns = self.index_config.file_glob or ["*"]
        # dict.fromkeys drops files matched by more than one pattern while
        # keeping the order in which they were found.
        matches = dict.fromkeys(
            itertools.chain.from_iterable(glob_fn(pattern) for pattern in patterns)
        )
        # Globbing also matches directories, which are not files to index.
        return [match for match in matches if match.is_file()]

    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
        """Build source metadata for ``path`` from its ``stat`` result.

        Args:
            path: File to describe.

        Returns:
            Metadata holding the modification time, the creation time (``None``
            when the stat result has no ``st_birthtime``), the current time as
            processing date, the file mode, and the resolved path as record
            locator.
        """
        stats = path.stat()
        # st_birthtime is not available on every platform; leave the creation
        # date unset instead of failing or warning once per file.
        birthtime = getattr(stats, "st_birthtime", None)
        return DataSourceMetadata(
            date_modified=str(stats.st_mtime),
            date_created=str(birthtime) if birthtime is not None else None,
            date_processed=str(time()),
            permissions_data=[{"mode": stats.st_mode}],
            record_locator={"path": str(path.resolve())},
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per file returned by ``list_files``.

        The relative path is the file's location below the configured
        directory, or the file's own name when the configured path is a single
        file.
        """
        root = self.index_config.path
        root_is_file = root.is_file()
        for file_path in self.list_files():
            resolved = str(file_path.resolve())
            if root_is_file:
                rel_path = root.name
            else:
                try:
                    # Strip only the leading root; a plain string replace would
                    # also remove later occurrences of the root text in the path.
                    rel_path = str(file_path.relative_to(root))
                except ValueError:
                    # Path is not located under the root; fall back to its name.
                    rel_path = file_path.name
            yield FileData(
                identifier=resolved,
                connector_type=CONNECTOR_TYPE,
                source_identifiers=SourceIdentifiers(
                    fullpath=resolved,
                    filename=file_path.name,
                    rel_path=rel_path,
                ),
                metadata=self.get_file_metadata(path=file_path),
            )
123
+
124
+
125
@dataclass
class LocalDownloaderConfig(DownloaderConfig):
    """Downloader configuration for the local connector; it declares no fields of its own."""
128
+
129
+
130
@dataclass
class LocalDownloader(Downloader):
    """Downloader for local files: it reports the file's existing path."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the full source path recorded in ``file_data`` as a ``Path``."""
        source_path = file_data.source_identifiers.fullpath
        return Path(source_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Wrap ``file_data`` and its full source path in a ``DownloadResponse``."""
        local_path = Path(file_data.source_identifiers.fullpath)
        return DownloadResponse(file_data=file_data, path=local_path)
145
+
146
+
147
@dataclass
class LocalUploaderConfig(UploaderConfig):
    """Settings for writing results to a local directory."""

    # Directory that receives the output files.
    output_dir: str = field(default="structured-output")

    @property
    def output_path(self) -> Path:
        """``output_dir`` converted to a resolved ``Path``."""
        target_dir = Path(self.output_dir)
        return target_dir.resolve()

    def __post_init__(self):
        """Reject an output location that is already occupied by a regular file."""
        # is_file() is already False for a path that does not exist.
        if self.output_path.is_file():
            raise ValueError("output path already exists as a file")
158
+
159
+
160
@dataclass
class LocalUploader(Uploader):
    """Copy processed output files into the configured local output directory."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig())
    connection_config: LocalConnectionConfig = field(
        default_factory=lambda: LocalConnectionConfig()
    )

    def is_async(self) -> bool:
        """Report that this uploader does not run asynchronously."""
        return False

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Copy every item in ``contents`` below the output directory.

        The destination mirrors the source's relative path with ``.json``
        appended to the file name. Items without source identifiers are
        written as ``<identifier>.json`` directly under the output directory.

        Args:
            contents: Items whose ``path`` is copied and whose ``file_data``
                determines the destination.
            **kwargs: Unused.
        """
        output_root = self.upload_config.output_path
        output_root.mkdir(parents=True, exist_ok=True)
        for content in contents:
            if identifiers := content.file_data.source_identifiers:
                # Strip every leading "/" so the joined path cannot become
                # absolute and land outside the output directory.
                rel_path = identifiers.relative_path.lstrip("/")
                target = output_root / rel_path
                # Append the suffix to the final component only; replacing the
                # file name across the whole path string would also rewrite
                # directories that contain the same text.
                final_path = target.with_name(f"{target.name}.json")
            else:
                final_path = output_root / f"{content.file_data.identifier}.json"
            final_path.parent.mkdir(parents=True, exist_ok=True)
            logger.debug(f"copying file from {content.path} to {final_path}")
            shutil.copy(src=str(content.path), dst=str(final_path))
192
+
193
+
194
# Bundles the local indexer, downloader and their config classes into a single
# SourceRegistryEntry.
local_source_entry = SourceRegistryEntry(
    connection_config=LocalConnectionConfig,
    indexer_config=LocalIndexerConfig,
    indexer=LocalIndexer,
    downloader_config=LocalDownloaderConfig,
    downloader=LocalDownloader,
)
201
+
202
# Bundles the local uploader and its config class into a DestinationRegistryEntry.
local_destination_entry = DestinationRegistryEntry(
    uploader_config=LocalUploaderConfig,
    uploader=LocalUploader,
)
@@ -0,0 +1,138 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Optional
5
+
6
+ from unstructured.__version__ import __version__ as unstructured_version
7
+
8
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from unstructured_ingest.utils.data_prep import batch_generator
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ FileData,
15
+ UploadContent,
16
+ Uploader,
17
+ UploaderConfig,
18
+ UploadStager,
19
+ UploadStagerConfig,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import (
23
+ DestinationRegistryEntry,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from pymongo import MongoClient
28
+
29
+ CONNECTOR_TYPE = "mongodb"
30
+ SERVER_API_VERSION = "1"
31
+
32
+
33
@dataclass
class MongoDBAccessConfig(AccessConfig):
    """Access settings for MongoDB."""

    # Connection URI; when set, the uploader connects with it instead of host/port.
    uri: Optional[str] = None
36
+
37
+
38
@dataclass
class MongoDBConnectionConfig(ConnectionConfig):
    """Connection settings naming the MongoDB server, database and collection."""

    # Marked sensitive because it carries the connection URI.
    access_config: MongoDBAccessConfig = enhanced_field(
        default_factory=MongoDBAccessConfig,
        sensitive=True,
    )
    # Host and port are used by the uploader when no URI is configured.
    host: Optional[str] = None
    database: Optional[str] = None
    collection: Optional[str] = None
    port: int = 27017
    batch_size: int = 100
    connector_type: str = CONNECTOR_TYPE
49
+
50
+
51
@dataclass
class MongoDBUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for MongoDB; it declares no fields of its own."""
54
+
55
+
56
@dataclass
class MongoDBUploadStager(UploadStager):
    """Stage an elements file by re-serialising its JSON content into the output directory."""

    upload_stager_config: MongoDBUploadStagerConfig = field(
        default_factory=MongoDBUploadStagerConfig
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Load the JSON at ``elements_filepath`` and dump it to ``<output_filename>.json``.

        Args:
            elements_filepath: JSON file to read.
            file_data: Unused by this stager.
            output_dir: Directory in which the staged file is written.
            output_filename: Name of the staged file, without the ``.json`` suffix.
            **kwargs: Unused.

        Returns:
            Path of the staged file.
        """
        staged_path = Path(output_dir) / f"{output_filename}.json"
        loaded = json.loads(Path(elements_filepath).read_text())
        staged_path.write_text(json.dumps(loaded))
        return staged_path
77
+
78
+
79
@dataclass
class MongoDBUploaderConfig(UploaderConfig):
    """Uploader settings for MongoDB."""

    # Number of documents passed to each insert_many call by the uploader.
    batch_size: int = 100
82
+
83
+
84
@dataclass
class MongoDBUploader(Uploader):
    """Insert staged JSON elements into a MongoDB collection in batches."""

    upload_config: MongoDBUploaderConfig
    connection_config: MongoDBConnectionConfig
    # Created in __post_init__, hence excluded from the generated __init__.
    client: Optional["MongoClient"] = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Create the MongoDB client as soon as the uploader is constructed."""
        self.client = self.create_client()

    @requires_dependencies(["pymongo"], extras="mongodb")
    def create_client(self) -> "MongoClient":
        """Build a ``MongoClient`` from the connection configuration.

        The access config's URI is used when it is set; otherwise the client
        connects with the configured host and port.

        Returns:
            A client pinned to ``SERVER_API_VERSION`` that identifies itself
            with the ``unstructured`` driver info.
        """
        from pymongo import MongoClient
        from pymongo.driver_info import DriverInfo
        from pymongo.server_api import ServerApi

        # Shared by both connection styles so that host/port connections carry
        # the same driver info as URI connections.
        client_kwargs = {
            "server_api": ServerApi(version=SERVER_API_VERSION),
            "driver": DriverInfo(name="unstructured", version=unstructured_version),
        }
        uri = self.connection_config.access_config.uri
        if uri:
            return MongoClient(uri, **client_kwargs)
        return MongoClient(
            host=self.connection_config.host,
            port=self.connection_config.port,
            **client_kwargs,
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every content file and insert its elements into the collection.

        Args:
            contents: Items whose ``path`` points at a JSON file holding a list
                of elements.
            **kwargs: Unused.
        """
        elements_dict = []
        for content in contents:
            # Read as UTF-8 explicitly instead of relying on the platform default.
            with open(content.path, encoding="utf-8") as elements_file:
                elements_dict.extend(json.load(elements_file))

        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"db, {self.connection_config.database}, "
            f"collection {self.connection_config.collection} "
            f"at {self.connection_config.host}",
        )
        if not elements_dict:
            # Nothing to insert; skip resolving the database and collection.
            return
        db = self.client[self.connection_config.database]
        collection = db[self.connection_config.collection]
        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
            collection.insert_many(chunk)
130
+
131
+
132
# Bundles the MongoDB stager, uploader and their config classes into a single
# DestinationRegistryEntry.
mongodb_destination_entry = DestinationRegistryEntry(
    upload_stager_config=MongoDBUploadStagerConfig,
    upload_stager=MongoDBUploadStager,
    uploader_config=MongoDBUploaderConfig,
    uploader=MongoDBUploader,
    connection_config=MongoDBConnectionConfig,
)