unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,158 @@
1
+ import copy
2
+ import typing as t
3
+ import uuid
4
+ from dataclasses import dataclass
5
+
6
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
7
+ from unstructured_ingest.error import DestinationConnectionError
8
+ from unstructured_ingest.interfaces import (
9
+ AccessConfig,
10
+ BaseConnectorConfig,
11
+ BaseDestinationConnector,
12
+ WriteConfig,
13
+ )
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+ if t.TYPE_CHECKING:
19
+ from chromadb import Collection as ChromaCollection
20
+
21
+
22
@dataclass
class ChromaAccessConfig(AccessConfig):
    """Access-level options forwarded to the chromadb client constructor."""

    # Passed as the ``settings`` argument when the chromadb client is built.
    settings: t.Optional[t.Dict[str, str]] = None
    # Passed as the ``headers`` argument of ``chromadb.HttpClient``.
    headers: t.Optional[t.Dict[str, str]] = None
26
+
27
+
28
@dataclass
class SimpleChromaConfig(BaseConnectorConfig):
    """Connection settings for the Chroma destination connector.

    The connector builds a ``chromadb.PersistentClient`` when ``path`` is set,
    otherwise a ``chromadb.HttpClient`` when both ``host`` and ``port`` are set.
    """

    access_config: ChromaAccessConfig
    # Name handed to ``get_or_create_collection``.
    collection_name: str
    # Storage location for the persistent client; takes precedence over host/port.
    path: t.Optional[str] = None
    tenant: t.Optional[str] = "default_tenant"
    database: t.Optional[str] = "default_database"
    # Host, port and ssl are only consulted for the HTTP client.
    host: t.Optional[str] = None
    port: t.Optional[int] = None
    ssl: bool = False
38
+
39
+
40
@dataclass
class ChromaWriteConfig(WriteConfig):
    """Write options for the Chroma destination connector."""

    # Number of elements grouped into each upsert call by ``write_dict``.
    batch_size: int = 100
43
+
44
+
45
@dataclass
class ChromaDestinationConnector(BaseDestinationConnector):
    """Destination connector that upserts element dicts into a Chroma collection.

    The collection handle is created lazily on first access of
    ``chroma_collection`` and cached on ``_collection``.
    """

    write_config: ChromaWriteConfig
    connector_config: SimpleChromaConfig
    # Cached collection handle; populated on first use by ``chroma_collection``.
    _collection: t.Optional["ChromaCollection"] = None

    @property
    def chroma_collection(self):
        """Return the cached collection, creating it on first access."""
        if self._collection is None:
            self._collection = self.create_collection()
        return self._collection

    def initialize(self):
        """Do nothing; the collection is created lazily by ``chroma_collection``."""
        pass

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Check connectivity by forcing creation of the collection handle."""
        _ = self.chroma_collection

    def to_dict(self, **kwargs):
        """Serialize the connector with the cached collection handle cleared.

        The _collection variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle 'module' object
        When serializing, remove it, meaning collection data will need to be
        reinitialized when deserialized.
        """
        # Shallow copy so the live connector keeps its cached collection.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_collection"):
            self_cp._collection = None
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(["chromadb"], extras="chroma")
    def create_collection(self) -> "ChromaCollection":
        """Build a Chroma client from the config and get-or-create the collection.

        A ``chromadb.PersistentClient`` is used when ``path`` is set; otherwise
        a ``chromadb.HttpClient`` is used when both ``host`` and ``port`` are set.

        Returns:
            The collection named ``connector_config.collection_name``.

        Raises:
            ValueError: if neither ``path`` nor both ``host`` and ``port`` are set.
        """
        import chromadb

        config = self.connector_config
        access_config = config.access_config

        if config.path:
            chroma_client = chromadb.PersistentClient(
                path=config.path,
                # ``settings`` is declared on the access config (as the HTTP
                # branch below already assumes), not on the connector config.
                settings=access_config.settings,
                tenant=config.tenant,
                database=config.database,
            )
        elif config.host and config.port:
            chroma_client = chromadb.HttpClient(
                host=config.host,
                port=config.port,
                ssl=config.ssl,
                headers=access_config.headers,
                settings=access_config.settings,
                tenant=config.tenant,
                database=config.database,
            )
        else:
            raise ValueError("Chroma connector requires either path or host and port to be set.")

        return chroma_client.get_or_create_collection(name=config.collection_name)

    @DestinationConnectionError.wrap
    @requires_dependencies(["chromadb"], extras="chroma")
    def upsert_batch(self, batch):
        """Upsert one prepared batch into the collection.

        Args:
            batch: dict with the parallel lists ``ids``, ``documents``,
                ``embeddings`` and ``metadatas`` (see ``prepare_chroma_list``).

        Raises:
            ValueError: wrapping any exception raised by the upsert call.
        """
        collection = self.chroma_collection

        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise ValueError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: t.Tuple[t.Dict[str, t.Any]]) -> t.Dict[str, t.List[t.Any]]:
        """Break a tuple of dicts into parallel lists for ChromaDb.

        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3], ...}

        Keys missing from an element yield ``None`` in the matching list.
        """
        chroma_dict: t.Dict[str, t.List[t.Any]] = {
            "ids": [],
            "documents": [],
            "embeddings": [],
            "metadatas": [],
        }
        # A single pass keeps the four lists the same length by construction,
        # so no runtime length assertion is needed.
        for element in chunk:
            chroma_dict["ids"].append(element.get("id"))
            chroma_dict["documents"].append(element.get("document"))
            chroma_dict["embeddings"].append(element.get("embedding"))
            chroma_dict["metadatas"].append(element.get("metadata"))
        return chroma_dict

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upsert ``elements_dict`` in batches of ``write_config.batch_size``."""
        logger.info(f"Inserting / updating {len(elements_dict)} documents to destination ")

        chroma_batch_size = self.write_config.batch_size

        for chunk in batch_generator(elements_dict, chroma_batch_size):
            self.upsert_batch(self.prepare_chroma_list(chunk))

    def normalize_dict(self, element_dict: dict) -> dict:
        """Reshape an element dict into the id/embedding/document/metadata form.

        Note that ``element_dict`` is mutated: ``embeddings`` and ``text`` are
        popped from it, and whatever remains (including ``element_id``, which
        is only read) is passed to ``flatten_dict`` to form the metadata.
        A random UUID is used as the id when ``element_id`` is absent.
        """
        element_id = element_dict.get("element_id", str(uuid.uuid4()))
        return {
            "id": element_id,
            "embedding": element_dict.pop("embeddings", None),
            "document": element_dict.pop("text", None),
            "metadata": flatten_dict(
                element_dict, separator="-", flatten_lists=True, remove_none=True
            ),
        }
@@ -0,0 +1,122 @@
1
+ import typing as t
2
+ import uuid
3
+ from dataclasses import dataclass, field
4
+
5
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
6
+ from unstructured_ingest.error import DestinationConnectionError
7
+ from unstructured_ingest.interfaces import (
8
+ AccessConfig,
9
+ BaseConnectorConfig,
10
+ BaseDestinationConnector,
11
+ WriteConfig,
12
+ )
13
+ from unstructured_ingest.logger import logger
14
+ from unstructured_ingest.utils.data_prep import flatten_dict
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+
17
+ if t.TYPE_CHECKING:
18
+ from clarifai.client.input import Inputs
19
+
20
+
21
@dataclass
class ClarifaiAccessConfig(AccessConfig):
    """Credentials for the Clarifai destination connector."""

    # Passed as the ``pat`` argument when the Clarifai ``Inputs`` client is built.
    # Declared through ``enhanced_field`` with ``sensitive=True``.
    api_key: str = enhanced_field(sensitive=True)
24
+
25
+
26
@dataclass
class SimpleClarifaiConfig(BaseConnectorConfig):
    """Connection settings for the Clarifai destination connector."""

    access_config: ClarifaiAccessConfig
    # ``app_id`` and ``user_id`` are forwarded to the Clarifai ``Inputs`` client.
    app_id: str
    user_id: str
    # Forwarded to ``get_text_input`` for every uploaded element.
    dataset_id: t.Optional[str] = None
32
+
33
+
34
@dataclass
class ClarifaiWriteConfig(WriteConfig):
    """Write options for the Clarifai destination connector."""

    # Number of elements sent in each ``upload_inputs`` call by ``write_dict``.
    batch_size: int = 50
37
+
38
+
39
@dataclass
class ClarifaiDestinationConnector(BaseDestinationConnector):
    """Destination connector that uploads element dicts to a Clarifai app as text inputs.

    The Clarifai ``Inputs`` client is created lazily by the ``client`` property
    and cached on ``_client``.
    """

    write_config: ClarifaiWriteConfig
    connector_config: SimpleClarifaiConfig
    # Cached client; excluded from ``__init__`` and populated by ``client``.
    _client: t.Optional["Inputs"] = field(init=False, default=None)

    @property
    @requires_dependencies(["clarifai"], extras="clarifai")
    def client(self) -> "Inputs":
        """Return the cached Clarifai ``Inputs`` client, creating it on first access.

        Raises:
            ValueError: if no API key (PAT) is configured.
        """
        if self._client is None:
            from clarifai.client.input import Inputs

            clarifai_pat = self.connector_config.access_config.api_key
            # Fail with a clear message instead of an unbound-variable error
            # (or an attempt to raise a plain string) when the key is missing.
            if clarifai_pat is None:
                raise ValueError("please provide clarifai PAT key")

            self._client = Inputs(
                app_id=self.connector_config.app_id,
                user_id=self.connector_config.user_id,
                pat=clarifai_pat,
            )
        return self._client

    @requires_dependencies(["clarifai"], extras="clarifai")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Eagerly create the client so configuration errors surface early."""
        _ = self.client

    def check_connection(self):
        """Validate the connection by listing a single input.

        Raises:
            DestinationConnectionError: if the listing call fails for any reason.
        """
        try:
            # Consume the result so a lazily evaluated listing is actually executed.
            _ = list(self.client.list_inputs(page_no=1, per_page=1))
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def normalize_dict(self, element_dict: dict) -> dict:
        """Reshape an element dict into the form ``write_dict`` uploads.

        Note that ``element_dict`` is mutated: ``text`` is popped from it and the
        remaining keys are passed to ``flatten_dict`` to form the metadata.
        Each call assigns a fresh random hex UUID as ``input_id``.
        """
        return {
            "input_id": uuid.uuid4().hex,
            "text": element_dict.pop("text", None),
            "metadata": {
                **flatten_dict(
                    element_dict,
                    separator="_",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upload ``elements_dict`` in batches of ``write_config.batch_size``.

        Each element is expected to carry the ``input_id``, ``text`` and
        ``metadata`` keys produced by ``normalize_dict``. Exceptions raised by
        the Clarifai client propagate to the caller unchanged.
        """
        from google.protobuf.struct_pb2 import Struct

        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"app {self.connector_config.app_id} "
        )
        # Go through the lazy property so writing works even when
        # ``initialize`` was never called on this instance.
        client = self.client
        batch_size = self.write_config.batch_size
        for idx in range(0, len(elements_dict), batch_size):
            input_batch = []
            for elem in elements_dict[idx : idx + batch_size]:
                meta_struct = Struct()
                meta_struct.update(elem["metadata"])
                input_batch.append(
                    client.get_text_input(
                        input_id=elem["input_id"],
                        raw_text=elem["text"],
                        dataset_id=self.connector_config.dataset_id,
                        metadata=meta_struct,
                    )
                )
            result_id = client.upload_inputs(inputs=input_batch)
            logger.debug(
                f"Input posted successfully into {self.connector_config.app_id}. "
                f"Result id: {result_id}"
            )
@@ -0,0 +1,285 @@
1
+ import math
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+
7
+ import requests
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ BaseConnectorConfig,
14
+ BaseSingleIngestDoc,
15
+ BaseSourceConnector,
16
+ IngestDocCleanupMixin,
17
+ SourceConnectorCleanupMixin,
18
+ SourceMetadata,
19
+ )
20
+ from unstructured_ingest.logger import logger
21
+ from unstructured_ingest.utils.dep_check import requires_dependencies
22
+
23
+ if t.TYPE_CHECKING:
24
+ from atlassian import Confluence
25
+
26
+
27
@dataclass
class ConfluenceAccessConfig(AccessConfig):
    """Secret part of the Confluence connection settings."""

    # API token used as the password when connecting; declared as sensitive.
    api_token: str = enhanced_field(sensitive=True)
30
+
31
+
32
@dataclass
class SimpleConfluenceConfig(BaseConnectorConfig):
    """Connection and listing settings for a Confluence Cloud instance.

    ``user_email`` and the ``api_token`` held in ``access_config`` authenticate
    against Confluence Cloud, and ``url`` points at the instance.

    See https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/
    for details on the api_token.
    """

    # Account email used as the username.
    user_email: str
    # Holds the API token.
    access_config: ConfluenceAccessConfig
    # Base URL of the Confluence Cloud instance.
    url: str
    # Upper bound on how many spaces are listed.
    max_num_of_spaces: int = 500
    # Upper bound on how many documents are listed per space.
    max_num_of_docs_from_each_space: int = 100
    # Explicit spaces to read; empty by default.
    spaces: t.List[str] = field(default_factory=list)
49
+
50
+
51
@dataclass
class ConfluenceDocumentMeta:
    """Identifies one Confluence document.

    Pairs the id of the space the document lives in with the id of the
    document itself.
    """

    # Space the document belongs to.
    space_id: str
    # The document being fetched.
    document_id: str
60
+
61
+
62
def scroll_wrapper(func):
    """Wrap ``func`` so repeated calls collect up to a requested number of items.

    The returned callable accepts the same arguments as ``func`` plus a required
    keyword ``number_of_items_to_fetch``. It calls ``func`` with a ``limit`` of
    at most 100 and an advancing ``start`` offset (beginning at the caller's
    ``start`` or 0). Each response may be a list or a dict holding a
    ``"results"`` list; any other response type contributes nothing.

    Returns:
        A callable returning a list of at most ``number_of_items_to_fetch`` items.
    """

    def wrapper(*args, **kwargs):
        """Wraps a function to obtain scroll functionality."""
        number_of_items_to_fetch = kwargs.pop("number_of_items_to_fetch")
        if number_of_items_to_fetch <= 0:
            # Nothing was requested; this also avoids a zero page size and the
            # division by zero it would cause below.
            return []

        kwargs["limit"] = min(100, number_of_items_to_fetch)
        kwargs["start"] = kwargs.get("start", 0)

        all_results = []
        num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"])

        for _ in range(num_iterations):
            # Call once per page and reuse the response instead of issuing a
            # second identical request just to read the results.
            response = func(*args, **kwargs)
            if isinstance(response, list):
                all_results += response
            elif isinstance(response, dict):
                all_results += response["results"]

            kwargs["start"] += kwargs["limit"]

        return all_results[:number_of_items_to_fetch]

    return wrapper
86
+
87
+
88
@dataclass
class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Class encapsulating fetching a doc and writing processed results (but not
    doing the processing).

    Current implementation creates a Confluence connection object
    to fetch each doc, rather than creating it once for each thread.
    """

    connector_config: SimpleConfluenceConfig
    document_meta: ConfluenceDocumentMeta
    registry_name: str = "confluence"

    # TODO: remove one of filename or _tmp_download_file, using a wrapper
    @property
    def filename(self):
        """Resolved download path ``<download_dir>/<space_id>/<document_id>.html``.

        Returns None when no download directory is configured.
        """
        if not self.read_config.download_dir:
            return None
        return (
            Path(self.read_config.download_dir)
            / self.document_meta.space_id
            / f"{self.document_meta.document_id}.html"
        ).resolve()

    @property
    def _output_filename(self):
        """Create output file path based on output directory, space id and document id."""
        output_file = f"{self.document_meta.document_id}.json"
        return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the instance URL and page id that identify this document."""
        return {
            "url": self.connector_config.url,
            "page_id": self.document_meta.document_id,
        }

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_page(self):
        """Fetch the page with its history, version and rendered body.

        Returns:
            The page dict, or None when the API reports an ``ApiError`` (the
            error is logged).
        """
        from atlassian import Confluence
        from atlassian.errors import ApiError

        try:
            confluence = Confluence(
                self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
            result = confluence.get_page_by_id(
                page_id=self.document_meta.document_id,
                expand="history.lastUpdated,version,body.view",
            )
        except ApiError as e:
            logger.error(e)
            return None
        return result

    def update_source_metadata(self, **kwargs):
        """Fetches file metadata from the current page.

        Args:
            page: optional keyword holding an already-fetched page dict. When
                the keyword is absent the page is fetched with ``_get_page``.
                A page of None marks the document as not existing.
        """
        # Fetch only when the caller did not pass a page. ``kwargs.get`` would
        # evaluate its default eagerly and request the page on every call.
        page = kwargs["page"] if "page" in kwargs else self._get_page()
        if page is None:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        document_history = page["history"]
        date_created = datetime.strptime(
            document_history["createdDate"],
            timestamp_format,
        ).isoformat()
        if last_updated := document_history.get("lastUpdated", {}).get("when", ""):
            date_modified = datetime.strptime(last_updated, timestamp_format).isoformat()
        else:
            # No update recorded: fall back to the creation date.
            date_modified = date_created
        version = page["version"]["number"]
        self.source_metadata = SourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            version=version,
            source_url=page["_links"].get("self", None),
            exists=True,
        )

    @SourceConnectionError.wrap
    @requires_dependencies(["atlassian"], extras="confluence")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Download the page and write its rendered body to ``filename``.

        Raises:
            ValueError: when the page could not be retrieved (before any
                handling applied by the decorators).
        """
        # TODO: instead of having a separate connection object for each doc,
        # have a separate connection object for each process

        result = self._get_page()
        self.update_source_metadata(page=result)
        if result is None:
            raise ValueError(f"Failed to retrieve page with ID {self.document_meta.document_id}")
        self.document = result["body"]["view"]["value"]
        self.filename.parent.mkdir(parents=True, exist_ok=True)
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(self.document)
190
+
191
+
192
@dataclass
class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Fetches body fields from all documents within all spaces in a Confluence Cloud instance."""

    connector_config: SimpleConfluenceConfig
    _confluence: t.Optional["Confluence"] = field(init=False, default=None)

    @property
    def confluence(self) -> "Confluence":
        """Return the Confluence client, creating it on first access."""
        from atlassian import Confluence

        if self._confluence is None:
            self._confluence = Confluence(
                url=self.connector_config.url,
                username=self.connector_config.user_email,
                password=self.connector_config.access_config.api_token,
            )
        return self._confluence

    @requires_dependencies(["atlassian"], extras="Confluence")
    def check_connection(self):
        """Send a HEAD request to the space endpoint to validate the connection.

        Raises:
            SourceConnectionError: when the request raises ``requests.HTTPError``.
        """
        url = "rest/api/space"
        try:
            self.confluence.request(method="HEAD", path=url)
        except requests.HTTPError as http_error:
            logger.error(f"failed to validate connection: {http_error}", exc_info=True)
            raise SourceConnectionError(
                f"failed to validate connection: {http_error}"
            ) from http_error

    @requires_dependencies(["atlassian"], extras="Confluence")
    def initialize(self):
        """Record the explicitly configured spaces, if any, on ``list_of_spaces``."""
        self.list_of_spaces = None
        if self.connector_config.spaces:
            self.list_of_spaces = self.connector_config.spaces
            if self.connector_config.max_num_of_spaces:
                # Adjacent literals keep the message on one line; a triple-quoted
                # string would carry source newlines and indentation into the log.
                logger.warning(
                    "--confluence-list-of-spaces and --confluence-num-of-spaces cannot "
                    "be used at the same time. Connector will only fetch the "
                    "--confluence-list-of-spaces that you've provided.",
                )

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_space_ids(self):
        """Fetches spaces in a confluence domain."""
        get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces)
        all_results = get_spaces_with_scroll(
            number_of_items_to_fetch=self.connector_config.max_num_of_spaces,
        )
        return [space["key"] for space in all_results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_docs_ids_within_one_space(
        self,
        space_id: str,
        content_type: str = "page",
    ):
        """Return ``(space_id, document_id)`` pairs for the documents of one space."""
        get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space)
        results = get_pages_with_scroll(
            space=space_id,
            number_of_items_to_fetch=self.connector_config.max_num_of_docs_from_each_space,
            content_type=content_type,
        )
        return [(space_id, doc["id"]) for doc in results]

    @requires_dependencies(["atlassian"], extras="Confluence")
    def _get_doc_ids_within_spaces(self):
        """Return ``(space_id, document_id)`` pairs across all spaces to read.

        Explicitly configured spaces win; otherwise the spaces are listed
        from the instance.
        """
        try:
            configured_spaces = self.list_of_spaces
        except AttributeError:
            # ``initialize`` has not run yet; read the configuration directly.
            configured_spaces = self.connector_config.spaces
        space_ids = configured_spaces if configured_spaces else self._get_space_ids()

        return [
            (space_id, doc_id)
            for space_key in space_ids
            for space_id, doc_id in self._get_docs_ids_within_one_space(space_id=space_key)
        ]

    def get_ingest_docs(self):
        """Fetches all documents in a confluence space."""
        doc_ids = self._get_doc_ids_within_spaces()
        return [
            ConfluenceIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                document_meta=ConfluenceDocumentMeta(space_id, doc_id),
            )
            for space_id, doc_id in doc_ids
        ]
@@ -0,0 +1,137 @@
1
+ import copy
2
+ import json
3
+ import os
4
+ import typing as t
5
+ from dataclasses import dataclass, field
6
+ from io import BytesIO
7
+ from pathlib import PurePath
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
11
+ from unstructured_ingest.error import DestinationConnectionError
12
+ from unstructured_ingest.interfaces import (
13
+ AccessConfig,
14
+ BaseConnectorConfig,
15
+ BaseDestinationConnector,
16
+ BaseSingleIngestDoc,
17
+ WriteConfig,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.dep_check import requires_dependencies
21
+
22
+ if t.TYPE_CHECKING:
23
+ from databricks.sdk import WorkspaceClient
24
+
25
+
26
@dataclass
class DatabricksVolumesAccessConfig(AccessConfig):
    """Optional authentication settings for a Databricks workspace.

    Every field defaults to None. The secret-bearing fields (``password``,
    ``client_secret``, ``token``, ``azure_client_secret``) are declared as
    sensitive through ``enhanced_field``.
    """

    account_id: t.Optional[str] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    client_id: t.Optional[str] = None
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    profile: t.Optional[str] = None
    # Azure-specific settings.
    azure_workspace_resource_id: t.Optional[str] = None
    azure_client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    azure_client_id: t.Optional[str] = None
    azure_tenant_id: t.Optional[str] = None
    azure_environment: t.Optional[str] = None
    auth_type: t.Optional[str] = None
    cluster_id: t.Optional[str] = None
    # Google-specific settings.
    google_credentials: t.Optional[str] = None
    google_service_account: t.Optional[str] = None
44
+
45
+
46
@dataclass
class SimpleDatabricksVolumesConfig(BaseConnectorConfig):
    """Connector settings: the access configuration plus an optional host."""

    # Authentication settings.
    access_config: DatabricksVolumesAccessConfig
    # Workspace host; None when not given.
    host: t.Optional[str] = None
50
+
51
+
52
@dataclass
class DatabricksVolumesWriteConfig(WriteConfig):
    """Settings describing where and how output is written inside a volume."""

    volume: str
    catalog: str
    # Optional sub-path appended below the volume root.
    volume_path: t.Optional[str] = None
    overwrite: bool = False
    encoding: str = "utf-8"
    schema: str = "default"

    @property
    def path(self) -> str:
        """Return ``/Volumes/<catalog>/<schema>/<volume>`` plus ``volume_path`` if set."""
        volume_root = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}"
        if not self.volume_path:
            return volume_root
        return f"{volume_root}/{self.volume_path}"
67
+
68
+
69
@dataclass
class DatabricksVolumesDestinationConnector(BaseDestinationConnector):
    """Uploads JSON output files into a Databricks volume."""

    write_config: DatabricksVolumesWriteConfig
    connector_config: SimpleDatabricksVolumesConfig
    _client: t.Optional["WorkspaceClient"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """Serialize a shallow copy of the connector with its client cleared."""
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def generate_client(self) -> "WorkspaceClient":
        """Create a ``WorkspaceClient`` from the host and the access configuration."""
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient(
            host=self.connector_config.host, **self.connector_config.access_config.to_dict()
        )

    @property
    def client(self) -> "WorkspaceClient":
        """Return the workspace client, creating it on first access."""
        if self._client is None:
            self._client = self.generate_client()
        return self._client

    def check_connection(self):
        """Validate the connection by checking that the current user is active.

        Raises:
            DestinationConnectionError: when the lookup fails or the user is
                not active.
        """
        try:
            # Explicit check instead of ``assert``: assert statements are
            # removed under ``python -O``, which would skip the validation.
            if not self.client.current_user.me().active:
                raise ValueError("current user is not active")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def initialize(self):
        """Access the ``client`` property so the client is created."""
        _ = self.client

    def write_dict(
        self,
        *args,
        elements_dict: t.List[t.Dict[str, t.Any]],
        filename: t.Optional[str] = None,
        indent: int = 4,
        encoding: str = "utf-8",
        **kwargs,
    ) -> None:
        """Upload ``elements_dict`` as JSON below the configured volume path.

        Args:
            elements_dict: the elements to serialize.
            filename: optional name relative to the volume path; without it the
                volume path itself is the upload target.
            indent: accepted but not used by this method.
            encoding: accepted but not used; the encoding comes from
                ``write_config.encoding``.
        """
        output_folder = self.write_config.path
        # Drop surrounding separators so the filename is joined as a relative part.
        filename = filename.strip(os.sep) if filename else filename
        output_path = str(PurePath(output_folder, filename)) if filename else output_folder
        logger.debug(f"uploading content to {output_path}")
        self.client.files.upload(
            file_path=output_path,
            contents=BytesIO(json.dumps(elements_dict).encode(encoding=self.write_config.encoding)),
            overwrite=self.write_config.overwrite,
        )

    def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]:
        """Not implemented for this connector; returns None."""
        pass

    def write(self, docs: t.List[BaseSingleIngestDoc]) -> None:
        """Read each doc's JSON output file and upload its contents."""
        for doc in docs:
            filename = doc.base_output_filename or None
            with open(doc._output_filename) as json_file:
                logger.debug(f"uploading content from {doc._output_filename}")
                json_list = json.load(json_file)
                self.write_dict(elements_dict=json_list, filename=filename)