unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,152 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Optional
5
+
6
+ from unstructured import __name__ as integration_name
7
+ from unstructured.__version__ import __version__ as integration_version
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.utils.data_prep import batch_generator
11
+ from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.v2.interfaces import (
13
+ AccessConfig,
14
+ ConnectionConfig,
15
+ FileData,
16
+ UploadContent,
17
+ Uploader,
18
+ UploaderConfig,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ )
22
+ from unstructured_ingest.v2.logger import logger
23
+ from unstructured_ingest.v2.processes.connector_registry import (
24
+ DestinationRegistryEntry,
25
+ )
26
+
27
+ if TYPE_CHECKING:
28
+ from astrapy.db import AstraDBCollection
29
+
30
+ CONNECTOR_TYPE = "astra"
31
+
32
+
33
@dataclass
class AstraAccessConfig(AccessConfig):
    """Credentials used to reach an Astra DB instance."""

    # Passed as ``token`` when the AstraDB client is built.
    token: str
    # Passed as ``api_endpoint`` when the AstraDB client is built.
    api_endpoint: str
37
+
38
+
39
@dataclass
class AstraConnectionConfig(ConnectionConfig):
    """Connection settings for the Astra destination connector."""

    # NOTE(review): this field is spelled ``connection_type`` while the uploader
    # uses ``connector_type`` -- confirm whether the difference is intentional.
    connection_type: str = CONNECTOR_TYPE
    # Declared through ``enhanced_field`` with ``sensitive=True``.
    access_config: AstraAccessConfig = enhanced_field(sensitive=True)
43
+
44
+
45
@dataclass
class AstraUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Astra upload stager; it declares no fields of its own."""
48
+
49
+
50
@dataclass
class AstraUploadStager(UploadStager):
    """Reshape partitioned elements into the record layout written to Astra DB."""

    upload_stager_config: AstraUploadStagerConfig = field(
        default_factory=lambda: AstraUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict) -> dict:
        """Convert one element dict into an Astra record.

        Args:
            element_dict: A single element as loaded from the elements JSON file.

        Returns:
            A dict with ``$vector`` (the element's ``embeddings`` or ``None``),
            ``content`` (the element's ``text`` or ``None``) and ``metadata``
            (every remaining key of the element).
        """
        # Work on a shallow copy so the caller's dict is left untouched; popping
        # from the argument itself would make a second call on the same element
        # silently return ``None`` for both the vector and the content.
        metadata = dict(element_dict)
        return {
            "$vector": metadata.pop("embeddings", None),
            "content": metadata.pop("text", None),
            "metadata": metadata,
        }

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read the elements file, conform every element and write the result.

        Args:
            elements_filepath: Path of the JSON file holding the elements.
            file_data: Accepted as part of the stager signature; not read here.
            output_dir: Directory in which the staged file is written.
            output_filename: Base name of the staged file; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            The path of the staged JSON file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        conformed_elements = [
            self.conform_dict(element_dict=element) for element in elements_contents
        ]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
80
+
81
+
82
@dataclass
class AstraUploaderConfig(UploaderConfig):
    """Settings that control how records are written to an Astra DB collection."""

    # Name handed to ``create_collection`` and reported in the upload log line.
    collection_name: str
    # Forwarded as ``dimension`` when the collection is created.
    embedding_dimension: int
    # Forwarded as ``namespace`` to the AstraDB client.
    namespace: Optional[str] = None
    # When truthy, sent as ``{"indexing": ...}`` in the collection options.
    requested_indexing_policy: Optional[dict[str, Any]] = None
    # Number of records passed to each ``insert_many`` call.
    batch_size: int = 20
89
+
90
+
91
@dataclass
class AstraUploader(Uploader):
    """Uploader that inserts staged records into an Astra DB collection."""

    connection_config: AstraConnectionConfig
    upload_config: AstraUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["astrapy"], extras="astra")
    def get_collection(self) -> "AstraDBCollection":
        """Build an AstraDB client and return the configured collection.

        ``create_collection`` is called on every invocation with the configured
        name, embedding dimension and (optionally) the requested indexing policy.
        """
        from astrapy.db import AstraDB

        upload_config = self.upload_config
        access_config = self.connection_config.access_config

        # Only send collection options when an indexing policy was requested.
        indexing_policy = upload_config.requested_indexing_policy
        if indexing_policy:
            options = {"indexing": indexing_policy}
        else:
            options = None

        # caller_name / caller_version identify this integration to AstraDB.
        client = AstraDB(
            api_endpoint=access_config.api_endpoint,
            token=access_config.token,
            namespace=upload_config.namespace,
            caller_name=integration_name,
            caller_version=integration_version,
        )
        return client.create_collection(
            collection_name=upload_config.collection_name,
            dimension=upload_config.embedding_dimension,
            options=options,
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged file and insert its records in batches.

        Args:
            contents: Upload items whose ``path`` points at a staged JSON file.
            **kwargs: Ignored.
        """
        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements_dict += json.load(elements_file)

        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        collection = self.get_collection()
        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
            collection.insert_many(chunk)
144
+
145
+
146
# Registry entry bundling the Astra connection, stager and uploader classes.
astra_destination_entry = DestinationRegistryEntry(
    connection_config=AstraConnectionConfig,
    uploader_config=AstraUploaderConfig,
    uploader=AstraUploader,
    upload_stager_config=AstraUploadStagerConfig,
    upload_stager=AstraUploadStager,
)
@@ -0,0 +1,211 @@
1
+ import json
2
+ import typing as t
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+
7
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
8
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
9
+ from unstructured_ingest.utils.data_prep import batch_generator
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ UploadContent,
15
+ Uploader,
16
+ UploaderConfig,
17
+ UploadStager,
18
+ UploadStagerConfig,
19
+ )
20
+ from unstructured_ingest.v2.logger import logger
21
+ from unstructured_ingest.v2.processes.connector_registry import (
22
+ DestinationRegistryEntry,
23
+ add_destination_entry,
24
+ )
25
+ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
26
+
27
+ if t.TYPE_CHECKING:
28
+ from azure.search.documents import SearchClient
29
+
30
+
31
+ CONNECTOR_TYPE = "azure_cognitive_search"
32
+
33
+
34
@dataclass
class AzureCognitiveSearchAccessConfig(AccessConfig):
    """Credentials for an Azure Cognitive Search service."""

    # Wrapped in ``AzureKeyCredential`` when the search client is created.
    # NOTE(review): ``overload_name`` presumably sets the externally exposed
    # option name -- confirm against ``enhanced_field``.
    key: t.Optional[str] = enhanced_field(
        default=None,
        overload_name="azure_cognitive_search_key",
    )
37
+
38
+
39
@dataclass
class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
    """Connection settings for an Azure Cognitive Search index."""

    # Service endpoint passed to ``SearchClient``.
    endpoint: str
    # Index name passed to ``SearchClient`` as ``index_name``.
    index: str
    access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True)

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
    def generate_client(self) -> "SearchClient":
        """Return a new ``SearchClient`` for the configured endpoint and index."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        credential = AzureKeyCredential(self.access_config.key)
        return SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=credential,
        )
55
+
56
+
57
@dataclass
class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Azure Cognitive Search stager; it declares no fields of its own."""
60
+
61
+
62
@dataclass
class AzureCognitiveSearchUploaderConfig(UploaderConfig):
    """Settings for uploading documents to Azure Cognitive Search."""

    # Number of documents sent in each upload call.
    batch_size: int = 100
65
+
66
+
67
@dataclass
class AzureCognitiveSearchUploadStager(UploadStager):
    """Reshape partitioned elements to fit the Azure Cognitive Search index schema."""

    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """Update an element dict in place so it conforms to the index schema.

        A fresh UUID is stored under ``id``. Selected metadata values are then
        converted when present and truthy: nested structures are JSON-encoded,
        ``version`` and ``page_number`` become strings, and date fields are
        re-formatted as ``%Y-%m-%dT%H:%M:%S.%fZ``.

        Args:
            data: The element dict; it is mutated and also returned.

        Returns:
            The same ``data`` object after conversion.
        """
        data["id"] = str(uuid.uuid4())

        # ``or {}`` also covers keys that are present but set to ``None``, which
        # a chained ``.get(key, {}).get(...)`` would crash on.
        metadata = data.get("metadata") or {}
        coordinates = metadata.get("coordinates") or {}
        data_source = metadata.get("data_source") or {}

        def convert(container: dict, key: str, converter: t.Callable[[t.Any], t.Any]) -> None:
            # Rewrite container[key] in place, skipping missing or falsy values.
            if value := container.get(key):
                container[key] = converter(value)

        def to_timestamp(value: t.Any) -> str:
            # Parse the value and render it in the timestamp layout used for the index.
            return parse_datetime(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        def dump_each(items: t.Iterable[t.Any]) -> t.List[str]:
            # JSON-encode every item individually, keeping the outer list.
            return [json.dumps(item) for item in items]

        convert(coordinates, "points", json.dumps)
        convert(data_source, "version", str)
        convert(data_source, "record_locator", json.dumps)
        convert(data_source, "permissions_data", json.dumps)
        convert(metadata, "links", dump_each)
        convert(metadata, "last_modified", to_timestamp)
        convert(data_source, "date_created", to_timestamp)
        convert(data_source, "date_modified", to_timestamp)
        convert(data_source, "date_processed", to_timestamp)
        convert(metadata, "regex_metadata", json.dumps)
        convert(metadata, "page_number", str)
        return data

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        **kwargs: t.Any,
    ) -> Path:
        """Read the elements file, conform every element and write the result.

        Args:
            elements_filepath: Path of the JSON file holding the elements.
            output_dir: Directory in which the staged file is written.
            output_filename: Base name of the staged file; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            The path of the staged JSON file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)

        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]

        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
136
+
137
+
138
@dataclass
class AzureCognitiveSearchUploader(Uploader):
    """Uploader that sends staged documents to an Azure Cognitive Search index."""

    upload_config: AzureCognitiveSearchUploaderConfig
    connection_config: AzureCognitiveSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-cognitive-search")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload one batch of documents and fail if any document is rejected.

        Args:
            elements_dict: The documents to upload (keyword-only).

        Raises:
            WriteError: When the service answers with an HTTP error, or when at
                least one per-document result reports failure.
        """
        # NOTE(review): this method is wrapped by ``DestinationConnectionError.wrap``;
        # how that wrapper treats ``WriteError`` is not visible here.
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            search_client = self.connection_config.generate_client()
            results = search_client.upload_documents(documents=elements_dict)
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        # Tally outcomes in one pass; only the failures are needed afterwards.
        succeeded_count = 0
        failures = []
        for outcome in results:
            if outcome.succeeded:
                succeeded_count += 1
            else:
                failures.append(outcome)
        logger.debug(f"results: {succeeded_count} successes, {len(failures)} failures")

        if not failures:
            return
        details = ", ".join(
            f"{error.key}: [{error.status_code}] {error.error_message}" for error in failures
        )
        raise WriteError(details)

    def write_dict_wrapper(self, elements_dict):
        """Positional-argument convenience wrapper around ``write_dict``."""
        return self.write_dict(elements_dict=elements_dict)

    def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
        """Load every staged file and upload its documents in batches.

        Args:
            contents: Upload items whose ``path`` points at a staged JSON file.
            **kwargs: Ignored.
        """
        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements_dict += json.load(elements_file)

        logger.info(
            f"writing document batches to destination"
            f" endpoint at {str(self.connection_config.endpoint)}"
            f" index at {str(self.connection_config.index)}"
            f" with batch size {str(self.upload_config.batch_size)}"
        )

        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
            self.write_dict(elements_dict=chunk)
200
+
201
+
202
# Register the Azure Cognitive Search destination under its connector type.
add_destination_entry(
    destination_type=CONNECTOR_TYPE,
    entry=DestinationRegistryEntry(
        connection_config=AzureCognitiveSearchConnectionConfig,
        upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
        upload_stager=AzureCognitiveSearchUploadStager,
        uploader_config=AzureCognitiveSearchUploaderConfig,
        uploader=AzureCognitiveSearchUploader,
    ),
)
@@ -0,0 +1,204 @@
1
+ import json
2
+ import uuid
3
+ from dataclasses import dataclass, field
4
+ from datetime import date, datetime
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Dict, Optional
7
+
8
+ from dateutil import parser
9
+
10
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
11
+ from unstructured_ingest.error import DestinationConnectionError
12
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ FileData,
18
+ UploadContent,
19
+ Uploader,
20
+ UploaderConfig,
21
+ UploadStager,
22
+ UploadStagerConfig,
23
+ )
24
+ from unstructured_ingest.v2.logger import logger
25
+ from unstructured_ingest.v2.processes.connector_registry import (
26
+ DestinationRegistryEntry,
27
+ )
28
+
29
+ if TYPE_CHECKING:
30
+ from chromadb import Client
31
+
32
+ CONNECTOR_TYPE = "chroma"
33
+
34
+
35
@dataclass
class ChromaAccessConfig(AccessConfig):
    """Access options for the Chroma destination connector."""

    # Passed as ``settings`` to whichever chromadb client gets created.
    settings: Optional[Dict[str, str]] = None
    # Passed as ``headers`` to ``chromadb.HttpClient`` (HTTP connections only).
    headers: Optional[Dict[str, str]] = None
39
+
40
+
41
@dataclass
class ChromaConnectionConfig(ConnectionConfig):
    """Connection settings for the Chroma destination connector.

    Either ``path`` (persistent client) or both ``host`` and ``port`` (HTTP
    client) must be set; the uploader raises ``ValueError`` otherwise.
    """

    # Name of the collection that is fetched or created before upserting.
    collection_name: str
    # Declared as a sensitive field.
    access_config: ChromaAccessConfig = enhanced_field(sensitive=True)
    # When set, a ``chromadb.PersistentClient`` is created for this path.
    path: Optional[str] = None
    tenant: Optional[str] = "default_tenant"
    database: Optional[str] = "default_database"
    # ``host`` and ``port`` together select ``chromadb.HttpClient``.
    host: Optional[str] = None
    port: Optional[int] = None
    # Forwarded as ``ssl`` to ``chromadb.HttpClient``.
    ssl: bool = False
    connector_type: str = CONNECTOR_TYPE
52
+
53
+
54
@dataclass
class ChromaUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Chroma upload stager; declares no fields of its own."""

    pass
57
+
58
+
59
@dataclass
class ChromaUploadStager(UploadStager):
    """Rewrites an elements JSON file into the record layout the Chroma uploader reads."""

    upload_stager_config: ChromaUploadStagerConfig = field(
        default_factory=ChromaUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        Args:
            date_string: A value convertible with ``float()`` (treated as a
                timestamp), or any string ``dateutil.parser.parse`` accepts.

        Returns:
            The parsed ``datetime`` (a ``date`` subclass).

        Raises:
            Whatever ``dateutil.parser.parse`` raises when the fallback parse
            also fails.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Only the failures float()/fromtimestamp() can produce are treated
            # as "not a timestamp"; anything else should surface.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Prepares dictionary in the format that Chroma requires

        Args:
            data: One element dictionary. It is not modified; the keys are
                removed from a shallow copy instead.

        Returns:
            A dict with ``id`` (the ``element_id`` or a fresh UUID string),
            ``embedding`` (from ``embeddings``), ``document`` (from ``text``)
            and ``metadata`` (every remaining key, flattened).
        """
        # Work on a shallow copy so popping keys does not alter the caller's dict.
        remaining = dict(data)
        element_id = remaining.get("element_id", str(uuid.uuid4()))
        return {
            "id": element_id,
            "embedding": remaining.pop("embeddings", None),
            "document": remaining.pop("text", None),
            "metadata": flatten_dict(
                remaining, separator="-", flatten_lists=True, remove_none=True
            ),
        }

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result as JSON.

        Args:
            elements_filepath: JSON file holding a list of element dicts.
            file_data: Not used by this method.
            output_dir: Directory the conformed file is written into.
            output_filename: Output file name; ``.json`` is appended.
            **kwargs: Accepted but not used by this method.

        Returns:
            Path of the written file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(conformed_elements, output_file)
        return output_path
102
+
103
+
104
@dataclass
class ChromaUploaderConfig(UploaderConfig):
    """Upload settings for the Chroma destination connector."""

    # Batch size handed to ``batch_generator`` when splitting records for upsert.
    batch_size: int = 100
107
+
108
+
109
@dataclass
class ChromaUploader(Uploader):
    """Upserts staged element records into a Chroma collection."""

    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): these two fields have no default yet follow a defaulted
    # field; presumably the Uploader base already declares them - confirm.
    upload_config: ChromaUploaderConfig
    connection_config: ChromaConnectionConfig
    # Not an __init__ argument; assigned in __post_init__.
    client: Optional["Client"] = field(init=False)

    def __post_init__(self):
        """Create the chromadb client as soon as the uploader is constructed."""
        self.client = self.create_client()

    @requires_dependencies(["chromadb"], extras="chroma")
    def create_client(self) -> "Client":
        """Build a chromadb client from the connection config.

        Returns:
            A ``PersistentClient`` when ``path`` is set, otherwise an
            ``HttpClient`` when both ``host`` and ``port`` are set.

        Raises:
            ValueError: If neither ``path`` nor ``host`` plus ``port`` is set.
        """
        import chromadb

        if self.connection_config.path:
            return chromadb.PersistentClient(
                path=self.connection_config.path,
                settings=self.connection_config.access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )

        elif self.connection_config.host and self.connection_config.port:
            return chromadb.HttpClient(
                host=self.connection_config.host,
                port=self.connection_config.port,
                ssl=self.connection_config.ssl,
                headers=self.connection_config.access_config.headers,
                settings=self.connection_config.access_config.settings,
                tenant=self.connection_config.tenant,
                database=self.connection_config.database,
            )
        else:
            raise ValueError("Chroma connector requires either path or host and port to be set.")

    @DestinationConnectionError.wrap
    def upsert_batch(self, collection, batch):
        """Upsert one batch of parallel lists into ``collection``.

        Args:
            collection: Object exposing ``upsert(ids=, documents=, embeddings=,
                metadatas=)``.
            batch: Dict with ``ids``, ``documents``, ``embeddings`` and
                ``metadatas`` lists, as built by ``prepare_chroma_list``.

        Raises:
            ValueError: Wrapping any exception raised by the upsert call.
        """
        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise ValueError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: tuple[dict[str, Any], ...]) -> dict[str, list[Any]]:
        """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb.
        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}

        Keys missing from a record become ``None`` in the matching list.
        """
        # Every list is built from the same chunk, so they always have equal
        # length; no runtime length check is needed.
        return {
            "ids": [x.get("id") for x in chunk],
            "documents": [x.get("document") for x in chunk],
            "embeddings": [x.get("embedding") for x in chunk],
            "metadatas": [x.get("metadata") for x in chunk],
        }

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load all staged JSON files and upsert their records in batches.

        Args:
            contents: Upload items; the file at each item's ``path`` is parsed
                as JSON and its records are appended to one combined list.
            **kwargs: Accepted but not used by this method.
        """
        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements = json.load(elements_file)
            elements_dict.extend(elements)

        # A persistent client has no host, so report the path in that case
        # rather than logging "at None".
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"collection {self.connection_config.collection_name} "
            f"at {self.connection_config.path or self.connection_config.host}",
        )

        collection = self.client.get_or_create_collection(
            name=self.connection_config.collection_name
        )
        for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
            self.upsert_batch(collection, self.prepare_chroma_list(chunk))
196
+
197
+
198
# Registry entry bundling the Chroma connection config, uploader and upload
# stager classes together with their config classes.
chroma_destination_entry = DestinationRegistryEntry(
    connection_config=ChromaConnectionConfig,
    uploader=ChromaUploader,
    uploader_config=ChromaUploaderConfig,
    upload_stager=ChromaUploadStager,
    upload_stager_config=ChromaUploadStagerConfig,
)
@@ -0,0 +1,96 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Any, Optional
4
+
5
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
6
+ from unstructured_ingest.utils.dep_check import requires_dependencies
7
+ from unstructured_ingest.v2.interfaces import (
8
+ AccessConfig,
9
+ ConnectionConfig,
10
+ UploadContent,
11
+ Uploader,
12
+ UploaderConfig,
13
+ )
14
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
15
+
16
+ if TYPE_CHECKING:
17
+ from databricks.sdk import WorkspaceClient
18
+
19
+ CONNECTOR_TYPE = "databricks_volumes"
20
+
21
+
22
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Credential and authentication options for the Databricks Volumes connector.

    The uploader unpacks ``to_dict()`` of this object into the
    ``WorkspaceClient`` constructor as keyword arguments.
    """

    # NOTE(review): presumably every field name below matches a
    # WorkspaceClient keyword argument - confirm against the Databricks SDK.
    account_id: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    client_id: Optional[str] = None
    client_secret: Optional[str] = None
    token: Optional[str] = None
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = None
    azure_client_secret: Optional[str] = None
    azure_client_id: Optional[str] = None
    azure_tenant_id: Optional[str] = None
    azure_environment: Optional[str] = None
    auth_type: Optional[str] = None
    cluster_id: Optional[str] = None
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None
40
+
41
+
42
@dataclass
class DatabricksVolumesConnectionConfig(ConnectionConfig):
    """Connection settings for the Databricks Volumes destination connector."""

    # Declared as a sensitive field; defaults to an access config with every
    # option unset.
    access_config: DatabricksVolumesAccessConfig = enhanced_field(
        default_factory=DatabricksVolumesAccessConfig, sensitive=True
    )
    # Passed as ``host`` to the ``WorkspaceClient`` constructor.
    host: Optional[str] = None
48
+
49
+
50
@dataclass
class DatabricksVolumesUploaderConfig(UploaderConfig):
    """Upload settings that identify the target volume and folder."""

    volume: str
    catalog: str
    volume_path: Optional[str] = None
    overwrite: bool = False
    schema: str = "default"

    @property
    def path(self) -> str:
        """Return ``/Volumes/<catalog>/<schema>/<volume>``, plus ``/<volume_path>`` when set."""
        volume_root = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
        if not self.volume_path:
            return volume_root
        return f"{volume_root}/{self.volume_path}"
64
+
65
+
66
@dataclass
class DatabricksVolumesUploader(Uploader):
    """Uploads staged files into a Databricks volume."""

    connector_type: str = CONNECTOR_TYPE
    # NOTE(review): these two fields have no default yet follow a defaulted
    # field; presumably the Uploader base already declares them - confirm.
    upload_config: DatabricksVolumesUploaderConfig
    connection_config: DatabricksVolumesConnectionConfig
    # Not an __init__ argument; assigned in __post_init__.
    client: Optional["WorkspaceClient"] = field(init=False, default=None)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def __post_init__(self) -> None:
        """Create the ``WorkspaceClient`` from the host and the access config."""
        from databricks.sdk import WorkspaceClient

        self.client = WorkspaceClient(
            host=self.connection_config.host, **self.connection_config.access_config.to_dict()
        )

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload each file to ``<upload_config.path>/<file name>``.

        Args:
            contents: Upload items; each item's ``path`` is opened in binary
                mode and streamed to the volume.
            **kwargs: Accepted but not used by this method.
        """
        # The destination is a remote "/Volumes/..." path, so it must always
        # be joined with "/" regardless of the local OS separator.
        import posixpath

        for content in contents:
            output_path = posixpath.join(self.upload_config.path, content.path.name)
            with open(content.path, "rb") as elements_file:
                self.client.files.upload(
                    file_path=output_path,
                    contents=elements_file,
                    overwrite=self.upload_config.overwrite,
                )
90
+
91
+
92
# Registry entry bundling the Databricks Volumes connection config, uploader
# and uploader config classes; no upload stager is supplied.
databricks_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksVolumesConnectionConfig,
    uploader=DatabricksVolumesUploader,
    uploader_config=DatabricksVolumesUploaderConfig,
)