unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,269 @@
1
+ import enum
2
+ import json
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from datetime import date, datetime
6
+ from pathlib import Path
7
+ from typing import Any, Optional, Union
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from dateutil import parser
12
+
13
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
14
+ from unstructured_ingest.utils.dep_check import requires_dependencies
15
+ from unstructured_ingest.v2.interfaces import (
16
+ AccessConfig,
17
+ ConnectionConfig,
18
+ FileData,
19
+ UploadContent,
20
+ Uploader,
21
+ UploaderConfig,
22
+ UploadStager,
23
+ UploadStagerConfig,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
27
+
28
# Default `connector_type` value for the SQL connection config and uploader.
CONNECTOR_TYPE = "sql"

# Table targeted by the uploader's INSERT statement.
ELEMENTS_TABLE_NAME = "elements"
30
+
31
+
32
@dataclass
class SQLAccessConfig(AccessConfig):
    """Optional username/password pair used when opening a SQL connection."""

    username: Optional[str] = None
    password: Optional[str] = None
36
+
37
+
38
class DatabaseType(str, enum.Enum):
    """Database backends the SQL connector can write to.

    Members are also `str` instances, so they compare equal to their raw
    string values.
    """

    SQLITE = "sqlite"
    POSTGRESQL = "postgresql"
41
+
42
+
43
@dataclass
class SimpleSqlConfig(ConnectionConfig):
    """Connection settings for the SQL destination.

    `database` is the sqlite file path for sqlite connections and the database
    name for postgresql connections. `host`, `port` and `access_config` are
    only read when a postgresql connection is opened.
    """

    # required default value here because of parent class
    db_type: DatabaseType = DatabaseType.SQLITE
    database: Optional[str] = None
    host: Optional[str] = None
    port: Optional[int] = 5432
    access_config: Optional[SQLAccessConfig] = enhanced_field(default=None, sensitive=True)
    connector_type: str = CONNECTOR_TYPE

    def __post_init__(self):
        """Validate the config.

        Raises:
            ValueError: if `db_type` is sqlite and no `database` was given.
        """
        is_sqlite = self.db_type == DatabaseType.SQLITE
        if is_sqlite and self.database is None:
            raise ValueError(
                "A sqlite connection requires a path to a *.db file "
                "through the `database` argument"
            )
61
+
62
+
63
@dataclass
class SQLUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for the SQL destination; declares no fields of its own."""
66
+
67
+
68
+ _COLUMNS = (
69
+ "id",
70
+ "element_id",
71
+ "text",
72
+ "embeddings",
73
+ "type",
74
+ "system",
75
+ "layout_width",
76
+ "layout_height",
77
+ "points",
78
+ "url",
79
+ "version",
80
+ "date_created",
81
+ "date_modified",
82
+ "date_processed",
83
+ "permissions_data",
84
+ "record_locator",
85
+ "category_depth",
86
+ "parent_id",
87
+ "attached_filename",
88
+ "filetype",
89
+ "last_modified",
90
+ "file_directory",
91
+ "filename",
92
+ "languages",
93
+ "page_number",
94
+ "links",
95
+ "page_name",
96
+ "link_urls",
97
+ "link_texts",
98
+ "sent_from",
99
+ "sent_to",
100
+ "subject",
101
+ "section",
102
+ "header_footer_type",
103
+ "emphasized_text_contents",
104
+ "emphasized_text_tags",
105
+ "text_as_html",
106
+ "regex_metadata",
107
+ "detection_class_prob",
108
+ )
109
+
110
+ _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")
111
+
112
+
113
def parse_date_string(date_value: Union[str, int]) -> date:
    """Convert a timestamp or a date string into a datetime.

    Integers are treated as epoch milliseconds; any other value is first tried
    as epoch seconds via `float()`. If that fails for any reason, the failure
    is logged at debug level and the value is handed to `dateutil`'s parser.

    Args:
        date_value: epoch timestamp (int or numeric string) or a date string.

    Returns:
        The parsed value as a `datetime` (a `date` subclass).
    """
    try:
        if isinstance(date_value, int):
            seconds = float(date_value) / 1000
        else:
            seconds = float(date_value)
        return datetime.fromtimestamp(seconds)
    except Exception as e:
        logger.debug(f"date {date_value} string not a timestamp: {e}")
    return parser.parse(date_value)
120
+
121
+
122
@dataclass
class SQLUploadStager(UploadStager):
    """Flatten partitioned elements into rows limited to the supported SQL columns."""

    upload_stager_config: SQLUploadStagerConfig = field(
        default_factory=lambda: SQLUploadStagerConfig()
    )

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage an elements JSON file as newline-delimited JSON rows.

        For each element, the nested `metadata`, `data_source` and
        `coordinates` dicts are merged into the top level, a fresh UUID is
        stored under `id`, and keys not listed in `_COLUMNS` are dropped.
        Date columns are parsed, structured columns are JSON-encoded and a few
        columns are stringified before the rows are written.

        Args:
            elements_filepath: JSON file containing a list of element dicts.
            file_data: not used by this stager.
            output_dir: directory that receives the staged file.
            output_filename: file name (without the `.json` suffix added here).
            **kwargs: ignored.

        Returns:
            Path of the written `<output_dir>/<output_filename>.json` file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        output = []
        for data in elements_contents:
            metadata: dict[str, Any] = data.pop("metadata", {})
            data_source = metadata.pop("data_source", {})
            coordinates = metadata.pop("coordinates", {})

            data.update(metadata)
            data.update(data_source)
            data.update(coordinates)

            data["id"] = str(uuid.uuid4())

            # Remove extraneous, unsupported columns by building a filtered
            # copy: popping keys from `data` while iterating over it raises
            # "RuntimeError: dictionary changed size during iteration".
            output.append({key: value for key, value in data.items() if key in _COLUMNS})

        df = pd.DataFrame.from_dict(output)
        for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
            df[column] = df[column].apply(parse_date_string)
        for column in filter(
            lambda x: x in df.columns,
            ("permissions_data", "record_locator", "points", "links"),
        ):
            # Anything that is not a list or dict is stored as null.
            df[column] = df[column].apply(
                lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
            )
        for column in filter(
            lambda x: x in df.columns,
            ("version", "page_number", "regex_metadata"),
        ):
            df[column] = df[column].apply(str)

        with output_path.open("w") as output_file:
            df.to_json(output_file, orient="records", lines=True)
        return output_path
177
+
178
+
179
@dataclass
class SQLUploaderConfig(UploaderConfig):
    """Uploader settings for the SQL destination."""

    # Number of rows read and inserted per batch.
    batch_size: int = 50
182
+
183
+
184
@dataclass
class SQLUploader(Uploader):
    """Insert staged element rows into the elements table of a sqlite or postgresql database."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: SQLUploaderConfig
    connection_config: SimpleSqlConfig

    @property
    def connection(self):
        """Return the connection factory (a callable) matching the configured db type.

        Raises:
            ValueError: if the configured database type is not supported.
        """
        if self.connection_config.db_type == DatabaseType.POSTGRESQL:
            return self._make_psycopg_connection
        elif self.connection_config.db_type == DatabaseType.SQLITE:
            return self._make_sqlite_connection
        raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")

    def _make_sqlite_connection(self):
        """Open a sqlite3 connection to the configured database file."""
        from sqlite3 import connect

        return connect(database=self.connection_config.database)

    @requires_dependencies(["psycopg2"], extras="postgres")
    def _make_psycopg_connection(self):
        """Open a psycopg2 connection from the configured credentials, host and port."""
        from psycopg2 import connect

        return connect(
            user=self.connection_config.access_config.username,
            password=self.connection_config.access_config.password,
            dbname=self.connection_config.database,
            host=self.connection_config.host,
            port=self.connection_config.port,
        )

    def prepare_data(
        self, columns: list[str], data: tuple[tuple[Any, ...], ...]
    ) -> list[tuple[Any, ...]]:
        """Convert raw row tuples into values suitable for the insert statement.

        For sqlite, list and dict values are JSON-encoded. Values in date
        columns are parsed with `parse_date_string`, except `None`, which is
        passed through.

        Args:
            columns: column names, in the same order as the values of each row.
            data: the rows to convert.

        Returns:
            One tuple per input row.
        """
        is_sqlite = self.connection_config.db_type == DatabaseType.SQLITE
        output = []
        for row in data:
            parsed = []
            for column_name, value in zip(columns, row):
                if is_sqlite and isinstance(value, (list, dict)):
                    value = json.dumps(value)
                if column_name in _DATE_COLUMNS and value is not None:
                    value = parse_date_string(value)
                parsed.append(value)
            output.append(tuple(parsed))
        return output

    def upload_contents(self, content: UploadContent) -> None:
        """Insert the rows of one staged file, one batch per connection.

        Args:
            content: upload item whose `path` points at a newline-delimited
                JSON file of rows.
        """
        from contextlib import closing

        df = pd.read_json(content.path, orient="records", lines=True)
        logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

        is_sqlite = self.connection_config.db_type == DatabaseType.SQLITE
        columns = tuple(df.columns)
        placeholders = ",".join("?" if is_sqlite else "%s" for _ in columns)
        # Column names come from the staged file; values are bound as parameters.
        stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) VALUES({placeholders})"

        for rows in pd.read_json(
            content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
        ):
            # Missing values must be replaced on the rows that are actually
            # inserted; doing it only on the full frame above left NaN in the
            # uploaded batches and bypassed the None check in prepare_data.
            rows = rows.replace({np.nan: None})
            values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
            # `with conn` only scopes a transaction for sqlite3/psycopg2, so
            # close the per-batch connection explicitly. Closing without a
            # commit discards the pending transaction on failure.
            with closing(self.connection()) as conn:
                if is_sqlite:
                    conn.executemany(stmt, values)
                else:
                    with conn.cursor() as cur:
                        cur.executemany(stmt, values)

                conn.commit()

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload every staged file in `contents`, in order."""
        for content in contents:
            self.upload_contents(content=content)
261
+
262
+
263
# Bundles the SQL connection config, uploader and stager classes into a
# single destination registry entry.
sql_destination_entry = DestinationRegistryEntry(
    connection_config=SimpleSqlConfig,
    uploader=SQLUploader,
    uploader_config=SQLUploaderConfig,
    upload_stager=SQLUploadStager,
    upload_stager_config=SQLUploadStagerConfig,
)
@@ -0,0 +1,19 @@
1
+ from datetime import datetime
2
+ from typing import Union
3
+
4
+ from dateutil import parser
5
+
6
+
7
def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime:
    """Coerce a timestamp or date string into a `datetime`.

    Args:
        date_value: one of
            - a `datetime`, returned unchanged;
            - a `float`, read as epoch seconds;
            - an `int`, read as epoch milliseconds;
            - a string, first tried as epoch seconds and otherwise parsed
              with `dateutil`.

    Returns:
        The corresponding `datetime`.
    """
    if isinstance(date_value, datetime):
        return date_value
    if isinstance(date_value, float):
        return datetime.fromtimestamp(date_value)
    if isinstance(date_value, int):
        return datetime.fromtimestamp(date_value / 1000)

    try:
        return datetime.fromtimestamp(float(date_value))
    except (ValueError, OverflowError, OSError):
        # float() rejects non-numeric strings with ValueError; fromtimestamp()
        # may raise OverflowError or OSError for numeric strings outside the
        # platform's supported range. Fall back to the date parser in each case.
        return parser.parse(date_value)
@@ -0,0 +1,235 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from datetime import date, datetime
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Optional
6
+
7
+ from dateutil import parser
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ FileData,
15
+ UploadContent,
16
+ Uploader,
17
+ UploaderConfig,
18
+ UploadStager,
19
+ UploadStagerConfig,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import (
23
+ DestinationRegistryEntry,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from weaviate import Client
28
+
29
# Connector identifier; used below as the default ``connector_type`` of the
# Weaviate connection config and uploader.
CONNECTOR_TYPE = "weaviate"
30
+
31
+
32
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Secret credentials for authenticating against Weaviate.

    Every field is optional. ``WeaviateUploader._resolve_auth_method`` picks
    the first usable credential, checking access_token, api_key,
    client_secret and then password (together with a username), in that order.
    """

    # Bearer token; sent together with the connection config's refresh_token.
    access_token: Optional[str] = None
    # API key for API-key authentication.
    api_key: Optional[str] = None
    # Client secret for client-credentials authentication.
    client_secret: Optional[str] = None
    # Password; only used when the connection config also has a username.
    password: Optional[str] = None
38
+
39
+
40
@dataclass
class WeaviateConnectionConfig(ConnectionConfig):
    """Connection settings for a Weaviate destination."""

    # URL handed to the weaviate ``Client``.
    host_url: str
    # Weaviate class that uploaded objects are added to.
    class_name: str
    # Credentials; declared through ``enhanced_field`` with ``sensitive=True``.
    access_config: WeaviateAccessConfig = enhanced_field(sensitive=True)
    # Paired with ``access_config.password`` for username/password auth.
    username: Optional[str] = None
    # When true, no auth object is built for the client.
    anonymous: bool = False
    # Passed as ``scope`` to client-credentials and username/password auth.
    scope: Optional[list[str]] = None
    # Passed together with ``access_config.access_token`` for bearer-token auth.
    refresh_token: Optional[str] = None
    connector_type: str = CONNECTOR_TYPE
50
+
51
+
52
@dataclass
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Configuration for ``WeaviateUploadStager``; declares no options of its own."""

    pass
55
+
56
+
57
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrite an elements JSON file so each element conforms to the Weaviate schema."""

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given as a numeric timestamp or as a free-form date string.

        The value is first tried as a float number of seconds since the epoch;
        if that fails for any reason it is handed to ``dateutil``'s parser.
        """
        try:
            timestamp = float(date_string)
            return datetime.fromtimestamp(timestamp)
        except Exception as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @classmethod
    def _format_date(cls, value: str) -> str:
        """Parse ``value`` and render it in the timestamp layout this stager writes."""
        # NOTE(review): a literal "Z" is appended without converting to UTC, and
        # numeric timestamps are parsed with a timezone-less ``fromtimestamp``.
        # Confirm the inputs are meant to be treated as UTC.
        return cls.parse_date_string(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    @classmethod
    def conform_dict(cls, data: dict) -> None:
        """
        Updates the element dictionary in place to conform to the Weaviate schema

        Nested structures are serialized to JSON strings, date fields are
        reformatted, and ``version`` / ``page_number`` are cast to strings.
        Only truthy values are rewritten. A ``metadata``, ``data_source`` or
        ``coordinates`` entry that is missing or null is skipped instead of
        raising.
        """
        # ``or {}`` also covers keys that are present but null; when the nested
        # dicts exist these names alias them, so the writes below land in ``data``.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = cls._format_date(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = cls._format_date(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result.

        Args:
            elements_filepath: JSON file holding the elements to stage.
            file_data: Not used by this stager.
            output_dir: Directory the staged file is written to.
            output_filename: Base name of the staged file; ``.json`` is appended.

        Returns:
            Path of the written ``<output_dir>/<output_filename>.json`` file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        for element in elements_contents:
            self.conform_dict(data=element)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(elements_contents, output_file)
        return output_path
148
+
149
+
150
@dataclass
class WeaviateUploaderConfig(UploaderConfig):
    """Options for ``WeaviateUploader``."""

    # Passed to ``client.batch.configure`` before objects are added.
    batch_size: int = 100
153
+
154
+
155
@dataclass
class WeaviateUploader(Uploader):
    """Write staged element files to a Weaviate class through the client's batch API."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig
    client: Optional["Client"] = field(init=False)
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["weaviate"], extras="weaviate")
    def __post_init__(self):
        """Build the weaviate client from the connection config."""
        from weaviate import Client

        credentials = self._resolve_auth_method()
        self.client = Client(
            url=self.connection_config.host_url,
            auth_client_secret=credentials,
        )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def _resolve_auth_method(self):
        """Return the weaviate auth object for the configured credentials.

        Anonymous connections, and configs with no usable credential, yield
        ``None``. Otherwise the first credential found wins, checked in the
        order: access token, API key, client secret, username + password.
        """
        secrets = self.connection_config.access_config
        settings = self.connection_config

        if settings.anonymous:
            return None

        if secrets.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=secrets.access_token,
                refresh_token=settings.refresh_token,
            )

        if secrets.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=secrets.api_key)

        if secrets.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=secrets.client_secret, scope=settings.scope
            )

        if settings.username and secrets.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=settings.username,
                password=secrets.password,
                scope=settings.scope,
            )

        return None

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Load every staged file and add its elements to the destination class.

        Each element's ``embeddings`` entry, if any, is removed from the object
        and supplied as its vector.
        """
        # TODO update to use async support in weaviate client
        # once the version can be bumped to include it
        objects = []
        for content in contents:
            with open(content.path) as staged_file:
                objects.extend(json.load(staged_file))

        logger.info(
            f"writing {len(objects)} objects to destination "
            f"class {self.connection_config.class_name} "
            f"at {self.connection_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.upload_config.batch_size)
        with self.client.batch as batch:
            for obj in objects:
                embedding = obj.pop("embeddings", None)
                batch.add_data_object(
                    obj,
                    self.connection_config.class_name,
                    vector=embedding,
                )
227
+
228
+
229
# Destination registry entry tying together the Weaviate connector's
# connection config, stager and uploader classes.
weaviate_destination_entry = DestinationRegistryEntry(
    connection_config=WeaviateConnectionConfig,
    upload_stager=WeaviateUploadStager,
    upload_stager_config=WeaviateUploadStagerConfig,
    uploader=WeaviateUploader,
    uploader_config=WeaviateUploaderConfig,
)
@@ -0,0 +1,76 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import Any, Optional
5
+
6
+ from unstructured.documents.elements import Element
7
+ from unstructured.embed.interfaces import BaseEmbeddingEncoder
8
+ from unstructured.staging.base import elements_from_json
9
+
10
+ from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
11
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
12
+
13
+
14
+ @dataclass
15
+ class EmbedderConfig(EnhancedDataClassJsonMixin):
16
+ embedding_provider: Optional[str] = None
17
+ embedding_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
18
+ embedding_model_name: Optional[str] = None
19
+ embedding_aws_access_key_id: Optional[str] = None
20
+ embedding_aws_secret_access_key: Optional[str] = None
21
+ embedding_aws_region: Optional[str] = None
22
+
23
+ def get_embedder(self) -> BaseEmbeddingEncoder:
24
+ kwargs: dict[str, Any] = {}
25
+ if self.embedding_api_key:
26
+ kwargs["api_key"] = self.embedding_api_key
27
+ if self.embedding_model_name:
28
+ kwargs["model_name"] = self.embedding_model_name
29
+ # TODO make this more dynamic to map to encoder configs
30
+ if self.embedding_provider == "langchain-openai":
31
+ from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
32
+
33
+ return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
34
+ elif self.embedding_provider == "langchain-huggingface":
35
+ from unstructured.embed.huggingface import (
36
+ HuggingFaceEmbeddingConfig,
37
+ HuggingFaceEmbeddingEncoder,
38
+ )
39
+
40
+ return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
41
+ elif self.embedding_provider == "octoai":
42
+ from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
43
+
44
+ return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
45
+ elif self.embedding_provider == "langchain-aws-bedrock":
46
+ from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
47
+
48
+ return BedrockEmbeddingEncoder(
49
+ config=BedrockEmbeddingConfig(
50
+ aws_access_key_id=self.embedding_aws_access_key_id,
51
+ aws_secret_access_key=self.embedding_aws_secret_access_key,
52
+ region_name=self.embedding_aws_region,
53
+ )
54
+ )
55
+ elif self.embedding_provider == "langchain-vertexai":
56
+ from unstructured.embed.vertexai import (
57
+ VertexAIEmbeddingConfig,
58
+ VertexAIEmbeddingEncoder,
59
+ )
60
+
61
+ return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
62
+ else:
63
+ raise ValueError(f"{self.embedding_provider} not a recognized encoder")
64
+
65
+
66
+ @dataclass
67
+ class Embedder(BaseProcess, ABC):
68
+ config: EmbedderConfig
69
+
70
+ def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
71
+ # TODO update base embedder classes to support async
72
+ embedder = self.config.get_embedder()
73
+ elements = elements_from_json(filename=str(elements_filepath))
74
+ if not elements:
75
+ return elements
76
+ return embedder.embed_documents(elements=elements)