unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,117 @@
1
+ import asyncio
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+ from pydantic import BaseModel
7
+
8
+
9
class EmbeddingConfig(BaseModel):
    """Base pydantic model that provider-specific embedding configurations extend."""
11
+
12
+
13
@dataclass
class BaseEncoder(ABC):
    """Common base for embedding encoders; holds the encoder configuration."""

    config: EmbeddingConfig

    def initialize(self):
        """Initialize the embedding encoder.

        Hook for subclasses, which should also validate that the instance is
        properly configured (e.g. by embedding a single element). The base
        implementation does nothing.
        """

    def wrap_error(self, e: Exception) -> Exception:
        """Handle errors from the embedding service.

        Subclasses should return a more informative error if possible; the
        base implementation returns the exception unchanged.

        Args:
            e (Exception): Error raised while talking to the embedding service.

        Returns:
            Exception: The error callers should raise.
        """
        return e

    @staticmethod
    def _add_embeddings_to_elements(
        elements: list[dict], embeddings: list[list[float]]
    ) -> list[dict]:
        """
        Add embeddings to elements.

        Each element dict is updated in place: its "embeddings" key is set to the
        embedding found at the same position.

        Args:
            elements (list[dict]): List of elements.
            embeddings (list[list[float]]): List of embeddings, one per element.

        Returns:
            list[dict]: The same ``elements`` list, with embeddings added.

        Raises:
            AssertionError: If the two lists differ in length.
        """
        assert len(elements) == len(embeddings)
        # Lengths are equal here, so zip pairs every element with its embedding.
        for element, embedding in zip(elements, embeddings):
            element["embeddings"] = embedding
        return elements
46
+
47
+
48
@dataclass
class BaseEmbeddingEncoder(BaseEncoder, ABC):
    """Synchronous embedding encoder interface.

    Subclasses must implement ``embed_documents`` and ``embed_query``; the
    other helpers defined here are built on top of ``embed_query``.
    """

    def initialize(self):
        """Initialize the embedding encoder.

        Hook for subclasses, which should also validate that the instance is
        properly configured (e.g. by embedding a single element). Does nothing
        by default.
        """

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of the exemplary embedding, as reported by ``np.shape``."""
        return np.shape(self.get_exemplary_embedding())

    def get_exemplary_embedding(self) -> list[float]:
        """Return the embedding of the fixed probe query ``"Q"``."""
        return self.embed_query(query="Q")

    @property
    def is_unit_vector(self) -> bool:
        """Denotes if the embedding vector is a unit vector."""
        norm = np.linalg.norm(self.get_exemplary_embedding())
        # np.isclose yields a numpy boolean; convert it so the annotated
        # built-in ``bool`` is what callers actually receive.
        return bool(np.isclose(norm, 1.0))

    @abstractmethod
    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the given element dicts and return the elements."""

    @abstractmethod
    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding."""

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed each text with its own ``embed_query`` call, preserving order.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: One embedding per input text.
        """
        return [self.embed_query(query=text) for text in elements]
84
+
85
+
86
@dataclass
class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
    """Asynchronous embedding encoder interface.

    Subclasses must implement the ``embed_documents`` and ``embed_query``
    coroutines; the other helpers defined here are built on top of
    ``embed_query``.
    """

    async def initialize(self):
        """Initialize the embedding encoder.

        Hook for subclasses, which should also validate that the instance is
        properly configured (e.g. by embedding a single element). Does nothing
        by default.
        """

    @property
    async def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of the exemplary embedding, as reported by ``np.shape``.

        The getter is a coroutine function, so accessing the property yields an
        awaitable: ``await encoder.num_of_dimensions``.
        """
        exemplary_embedding = await self.get_exemplary_embedding()
        return np.shape(exemplary_embedding)

    async def get_exemplary_embedding(self) -> list[float]:
        """Return the embedding of the fixed probe query ``"Q"``."""
        return await self.embed_query(query="Q")

    @property
    async def is_unit_vector(self) -> bool:
        """Denotes if the embedding vector is a unit vector.

        The getter is a coroutine function, so accessing the property yields an
        awaitable: ``await encoder.is_unit_vector``.
        """
        exemplary_embedding = await self.get_exemplary_embedding()
        # np.isclose yields a numpy boolean; convert it so the annotated
        # built-in ``bool`` is what callers actually receive.
        return bool(np.isclose(np.linalg.norm(exemplary_embedding), 1.0))

    @abstractmethod
    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the given element dicts and return the elements."""

    @abstractmethod
    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding."""

    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed every text concurrently through ``embed_query``.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: One embedding per input text, in input order.
        """
        # asyncio.gather returns results in the order the awaitables were passed.
        return await asyncio.gather(*(self.embed_query(query=text) for text in elements))
@@ -0,0 +1,233 @@
1
+ import asyncio
2
+ import os
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING
5
+
6
+ from pydantic import Field, SecretStr
7
+
8
+ from unstructured_ingest.embed.interfaces import (
9
+ AsyncBaseEmbeddingEncoder,
10
+ BaseEmbeddingEncoder,
11
+ EmbeddingConfig,
12
+ )
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+
15
+ USER_AGENT = "@mixedbread-ai/unstructured"
16
+ BATCH_SIZE = 128
17
+ TIMEOUT = 60
18
+ MAX_RETRIES = 3
19
+ ENCODING_FORMAT = "float"
20
+ TRUNCATION_STRATEGY = "end"
21
+
22
+
23
+ if TYPE_CHECKING:
24
+ from mixedbread_ai.client import AsyncMixedbreadAI, MixedbreadAI
25
+ from mixedbread_ai.core import RequestOptions
26
+
27
+
28
class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
    """
    Settings shared by the Mixedbread AI embedding encoders.

    Attributes:
        api_key (SecretStr): API key for accessing Mixedbread AI. When not supplied it
            is read from the MXBAI_API_KEY environment variable.
        embedder_model_name (str): Name of the model to use for embeddings; can be set
            through the "model_name" alias.
    """

    api_key: SecretStr = Field(
        default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
    )

    embedder_model_name: str = Field(
        default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
    )

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_client(self) -> "MixedbreadAI":
        """
        Build a synchronous Mixedbread AI client from the configured API key.

        Returns:
            MixedbreadAI: Initialized client.
        """
        # Imported lazily: the SDK is an optional dependency.
        from mixedbread_ai.client import MixedbreadAI

        secret = self.api_key.get_secret_value()
        return MixedbreadAI(api_key=secret)

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_async_client(self) -> "AsyncMixedbreadAI":
        """
        Build an asynchronous Mixedbread AI client from the configured API key.

        Returns:
            AsyncMixedbreadAI: Initialized async client.
        """
        # Imported lazily: the SDK is an optional dependency.
        from mixedbread_ai.client import AsyncMixedbreadAI

        secret = self.api_key.get_secret_value()
        return AsyncMixedbreadAI(api_key=secret)
72
+
73
+
74
@dataclass
class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """
    Embedding encoder for Mixedbread AI.

    Attributes:
        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
    """

    config: MixedbreadAIEmbeddingConfig

    def get_exemplary_embedding(self) -> list[float]:
        """Get an exemplary embedding to determine dimensions and unit vector status."""
        return self._embed(["Q"])[0]

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_request_options(self) -> "RequestOptions":
        """
        Build the per-request options sent with every embeddings call.

        Returns:
            RequestOptions: Retry count, timeout and User-Agent header taken from the
                module-level MAX_RETRIES, TIMEOUT and USER_AGENT constants.
        """
        from mixedbread_ai.core import RequestOptions

        return RequestOptions(
            max_retries=MAX_RETRIES,
            timeout_in_seconds=TIMEOUT,
            additional_headers={"User-Agent": USER_AGENT},
        )

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of texts using the Mixedbread AI API.

        The texts are sent in consecutive batches of at most BATCH_SIZE items, one
        request per batch.

        Args:
            texts (list[str]): List of texts to embed.

        Returns:
            list[list[float]]: List of embeddings, in batch order.
        """
        client = self.config.get_client()
        # The options do not depend on the batch, so build them once up front.
        request_options = self.get_request_options()

        embeddings: list[list[float]] = []
        for start in range(0, len(texts), BATCH_SIZE):
            response = client.embeddings(
                model=self.config.embedder_model_name,
                normalized=True,
                encoding_format=ENCODING_FORMAT,
                truncation_strategy=TRUNCATION_STRATEGY,
                request_options=request_options,
                input=texts[start : start + BATCH_SIZE],
            )
            embeddings.extend(item.embedding for item in response.data)
        return embeddings

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """
        Embed a list of document elements.

        The text to embed is read from each element's "text" key; a missing key is
        embedded as the empty string.

        Args:
            elements (list[dict]): List of document elements.

        Returns:
            list[dict]: Elements with embeddings.
        """
        embeddings = self._embed([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> list[float]:
        """
        Embed a query string.

        Args:
            query (str): Query string to embed.

        Returns:
            list[float]: Embedding of the query.
        """
        return self._embed([query])[0]
154
+
155
+
156
@dataclass
class AsyncMixedbreadAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """
    Asynchronous embedding encoder for Mixedbread AI.

    Attributes:
        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
    """

    config: MixedbreadAIEmbeddingConfig

    async def get_exemplary_embedding(self) -> list[float]:
        """Get an exemplary embedding to determine dimensions and unit vector status."""
        embedding = await self._embed(["Q"])
        return embedding[0]

    @requires_dependencies(["mixedbread_ai"], extras="embed-mixedbreadai")
    def get_request_options(self) -> "RequestOptions":
        """
        Build the per-request options sent with every embeddings call.

        Returns:
            RequestOptions: Retry count, timeout and User-Agent header taken from the
                module-level MAX_RETRIES, TIMEOUT and USER_AGENT constants.
        """
        from mixedbread_ai.core import RequestOptions

        return RequestOptions(
            max_retries=MAX_RETRIES,
            timeout_in_seconds=TIMEOUT,
            additional_headers={"User-Agent": USER_AGENT},
        )

    async def _embed(self, texts: list[str]) -> list[list[float]]:
        """
        Embed a list of texts using the Mixedbread AI API.

        The texts are split into consecutive batches of at most BATCH_SIZE items and
        the batch requests are awaited together.

        Args:
            texts (list[str]): List of texts to embed.

        Returns:
            list[list[float]]: List of embeddings, in batch order.
        """
        client = self.config.get_async_client()
        # The options do not depend on the batch, so build them once up front.
        request_options = self.get_request_options()

        tasks = [
            client.embeddings(
                model=self.config.embedder_model_name,
                normalized=True,
                encoding_format=ENCODING_FORMAT,
                truncation_strategy=TRUNCATION_STRATEGY,
                request_options=request_options,
                input=texts[start : start + BATCH_SIZE],
            )
            for start in range(0, len(texts), BATCH_SIZE)
        ]
        # gather returns responses in submission order, so batch order is preserved.
        responses = await asyncio.gather(*tasks)
        return [item.embedding for response in responses for item in response.data]

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """
        Embed a list of document elements.

        The text to embed is read from each element's "text" key; a missing key is
        embedded as the empty string.

        Args:
            elements (list[dict]): List of document elements.

        Returns:
            list[dict]: Elements with embeddings.
        """
        embeddings = await self._embed([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    async def embed_query(self, query: str) -> list[float]:
        """
        Embed a query string.

        Args:
            query (str): Query string to embed.

        Returns:
            list[float]: Embedding of the query.
        """
        embedding = await self._embed([query])
        return embedding[0]
@@ -0,0 +1,130 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field, SecretStr
5
+
6
+ from unstructured_ingest.embed.interfaces import (
7
+ AsyncBaseEmbeddingEncoder,
8
+ BaseEmbeddingEncoder,
9
+ EmbeddingConfig,
10
+ )
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.errors import (
14
+ ProviderError,
15
+ QuotaError,
16
+ RateLimitError,
17
+ UserAuthError,
18
+ UserError,
19
+ )
20
+
21
+ if TYPE_CHECKING:
22
+ from openai import AsyncOpenAI, OpenAI
23
+
24
+
25
class OctoAiEmbeddingConfig(EmbeddingConfig):
    """Configuration for the OctoAI embedding encoders, which use the OpenAI SDK."""

    # API key passed to the OpenAI client.
    api_key: SecretStr
    # Embedding model name; can be set through the "model_name" alias.
    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
    # Base URL the OpenAI client is pointed at.
    base_url: str = Field(default="https://text.octoai.run/v1")

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an openai ``APIStatusError`` into the project's error types.

        Mapping by HTTP status: 401 -> UserAuthError; 429 -> QuotaError when the
        error code is "insufficient_quota", otherwise RateLimitError; any other
        4xx -> UserError; 5xx -> ProviderError. Anything else is logged and
        returned unchanged.

        Args:
            e (Exception): Error raised by the client call.

        Returns:
            Exception: The error the caller should raise.
        """
        # https://platform.openai.com/docs/guides/error-codes/api-errors
        from openai import APIStatusError

        if not isinstance(e, APIStatusError):
            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
            # Return rather than raise: callers do `raise self.wrap_error(...)`,
            # so the original exception is still propagated.
            return e
        error_code = e.code
        if 400 <= e.status_code < 500:
            # user error
            if e.status_code == 401:
                return UserAuthError(e.message)
            if e.status_code == 429:
                # 429 indicates rate limit exceeded and quota exceeded
                if error_code == "insufficient_quota":
                    return QuotaError(e.message)
                return RateLimitError(e.message)
            return UserError(e.message)
        if e.status_code >= 500:
            return ProviderError(e.message)
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        return e

    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-octoai",
    )
    def get_client(self) -> "OpenAI":
        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
        from openai import OpenAI

        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)

    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-octoai",
    )
    def get_async_client(self) -> "AsyncOpenAI":
        """Creates an async OpenAI python client to embed elements. Uses the OpenAI SDK."""
        from openai import AsyncOpenAI

        return AsyncOpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
73
+
74
+
75
@dataclass
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous embedding encoder for OctoAI, accessed through the OpenAI SDK."""

    config: OctoAiEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string.

        Args:
            query (str): Text to embed.

        Returns:
            list[float]: Embedding of the query.

        Raises:
            Exception: Whatever ``wrap_error`` returns for a failed request.
        """
        # Build the client outside the try block so dependency or configuration
        # errors are not passed through the API-error translation.
        client = self.config.get_client()
        try:
            response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
        except Exception as e:
            raise self.wrap_error(e=e)
        return response.data[0].embedding

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element in a single request.

        Elements without a "text" key are embedded as the empty string.

        Args:
            elements (list[dict]): Elements to embed.

        Returns:
            list[dict]: The elements with their embeddings added.

        Raises:
            Exception: Whatever ``wrap_error`` returns for a failed request.
        """
        texts = [e.get("text", "") for e in elements]
        # Same reasoning as in embed_query: keep only the API call inside the try.
        client = self.config.get_client()
        try:
            response = client.embeddings.create(input=texts, model=self.config.embedder_model_name)
        except Exception as e:
            raise self.wrap_error(e=e)
        embeddings = [data.embedding for data in response.data]
        return self._add_embeddings_to_elements(elements, embeddings)
100
+
101
+
102
@dataclass
class AsyncOctoAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous embedding encoder for OctoAI, accessed through the OpenAI SDK."""

    config: OctoAiEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str):
        """Embed a single query string and return its embedding.

        A failed request is re-raised as whatever ``wrap_error`` returns.
        """
        async_client = self.config.get_async_client()
        model = self.config.embedder_model_name
        try:
            result = await async_client.embeddings.create(input=query, model=model)
        except Exception as e:
            raise self.wrap_error(e=e)
        else:
            return result.data[0].embedding

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element in a single request.

        Elements without a "text" key are embedded as the empty string. A failed
        request is re-raised as whatever ``wrap_error`` returns.
        """
        texts = [element.get("text", "") for element in elements]
        async_client = self.config.get_async_client()
        model = self.config.embedder_model_name
        try:
            result = await async_client.embeddings.create(input=texts, model=model)
        except Exception as e:
            raise self.wrap_error(e=e)
        vectors = [entry.embedding for entry in result.data]
        return self._add_embeddings_to_elements(elements, vectors)
@@ -0,0 +1,116 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field, SecretStr
5
+
6
+ from unstructured_ingest.embed.interfaces import (
7
+ AsyncBaseEmbeddingEncoder,
8
+ BaseEmbeddingEncoder,
9
+ EmbeddingConfig,
10
+ )
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.errors import (
14
+ ProviderError,
15
+ QuotaError,
16
+ RateLimitError,
17
+ UserAuthError,
18
+ UserError,
19
+ )
20
+
21
+ if TYPE_CHECKING:
22
+ from openai import AsyncOpenAI, OpenAI
23
+
24
+
25
class OpenAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the OpenAI embedding encoders."""

    # API key passed to the OpenAI client.
    api_key: SecretStr
    # Embedding model name; can be set through the "model_name" alias.
    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an openai ``APIStatusError`` into the project's error types.

        Mapping by HTTP status: 401 -> UserAuthError; 429 -> QuotaError when the
        error code is "insufficient_quota", otherwise RateLimitError; any other
        4xx -> UserError; 5xx -> ProviderError. Anything else is logged and
        returned unchanged.

        Args:
            e (Exception): Error raised by the client call.

        Returns:
            Exception: The error the caller should raise.
        """
        # https://platform.openai.com/docs/guides/error-codes/api-errors
        from openai import APIStatusError

        if not isinstance(e, APIStatusError):
            logger.error(f"unhandled exception from openai: {e}", exc_info=True)
            # Return rather than raise: callers do `raise self.wrap_error(...)`,
            # so the original exception is still propagated.
            return e
        error_code = e.code
        if 400 <= e.status_code < 500:
            # user error
            if e.status_code == 401:
                return UserAuthError(e.message)
            if e.status_code == 429:
                # 429 indicates rate limit exceeded and quota exceeded
                if error_code == "insufficient_quota":
                    return QuotaError(e.message)
                return RateLimitError(e.message)
            return UserError(e.message)
        if e.status_code >= 500:
            return ProviderError(e.message)
        logger.error(f"unhandled exception from openai: {e}", exc_info=True)
        return e

    @requires_dependencies(["openai"], extras="openai")
    def get_client(self) -> "OpenAI":
        """Create a synchronous OpenAI client using the configured API key."""
        from openai import OpenAI

        return OpenAI(api_key=self.api_key.get_secret_value())

    @requires_dependencies(["openai"], extras="openai")
    def get_async_client(self) -> "AsyncOpenAI":
        """Create an asynchronous OpenAI client using the configured API key."""
        from openai import AsyncOpenAI

        return AsyncOpenAI(api_key=self.api_key.get_secret_value())
64
+
65
+
66
@dataclass
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous embedding encoder backed by the OpenAI embeddings API."""

    config: OpenAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding.

        A failed request is re-raised as whatever ``wrap_error`` returns.
        """
        openai_client = self.config.get_client()
        model = self.config.embedder_model_name
        try:
            result = openai_client.embeddings.create(input=query, model=model)
        except Exception as e:
            raise self.wrap_error(e=e)
        else:
            return result.data[0].embedding

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element and attach the result to it.

        Elements without a "text" key are embedded as the empty string. The texts
        are handed to ``_embed_documents``.
        """
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)
86
+
87
+
88
@dataclass
class AsyncOpenAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous embedding encoder backed by the OpenAI embeddings API."""

    config: OpenAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding.

        A failed request is re-raised as whatever ``wrap_error`` returns.
        """
        async_client = self.config.get_async_client()
        model = self.config.embedder_model_name
        try:
            result = await async_client.embeddings.create(input=query, model=model)
        except Exception as e:
            raise self.wrap_error(e=e)
        else:
            return result.data[0].embedding

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element in a single request.

        Elements without a "text" key are embedded as the empty string. A failed
        request is re-raised as whatever ``wrap_error`` returns.
        """
        async_client = self.config.get_async_client()
        texts = [element.get("text", "") for element in elements]
        model = self.config.embedder_model_name
        try:
            result = await async_client.embeddings.create(input=texts, model=model)
        except Exception as e:
            raise self.wrap_error(e=e)
        vectors = [entry.embedding for entry in result.data]
        return self._add_embeddings_to_elements(elements, vectors)
@@ -0,0 +1,106 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field, SecretStr
5
+
6
+ from unstructured_ingest.embed.interfaces import (
7
+ AsyncBaseEmbeddingEncoder,
8
+ BaseEmbeddingEncoder,
9
+ EmbeddingConfig,
10
+ )
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.errors import (
14
+ RateLimitError as CustomRateLimitError,
15
+ )
16
+ from unstructured_ingest.v2.errors import (
17
+ UserAuthError,
18
+ UserError,
19
+ )
20
+
21
+ if TYPE_CHECKING:
22
+ from together import AsyncTogether, Together
23
+
24
+
25
class TogetherAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the Together AI embedding encoders."""

    # API key passed to the Together client.
    api_key: SecretStr
    # Embedding model name; can be set through the "model_name" alias.
    embedder_model_name: str = Field(
        default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
    )

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Together SDK exception into the project's error types.

        AuthenticationError -> UserAuthError; RateLimitError -> the project's
        RateLimitError; any other TogetherException -> UserError. Exceptions
        that are not TogetherException are logged and returned unchanged.

        Args:
            e (Exception): Error raised by the client call.

        Returns:
            Exception: The error the caller should raise.
        """
        # https://docs.together.ai/docs/error-codes
        from together.error import AuthenticationError, RateLimitError, TogetherException

        if not isinstance(e, TogetherException):
            logger.error(f"unhandled exception from together: {e}", exc_info=True)
            return e
        # Fall back to str(e) so an exception created without args cannot
        # trigger an IndexError while we are translating it.
        message = e.args[0] if e.args else str(e)
        if isinstance(e, AuthenticationError):
            return UserAuthError(message)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(message)
        return UserError(message)

    @requires_dependencies(["together"], extras="togetherai")
    def get_client(self) -> "Together":
        """Create a synchronous Together client using the configured API key."""
        from together import Together

        return Together(api_key=self.api_key.get_secret_value())

    @requires_dependencies(["together"], extras="togetherai")
    def get_async_client(self) -> "AsyncTogether":
        """Create an asynchronous Together client using the configured API key."""
        from together import AsyncTogether

        return AsyncTogether(api_key=self.api_key.get_secret_value())
56
+
57
+
58
@dataclass
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous embedding encoder backed by the Together AI embeddings API."""

    config: TogetherAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding."""
        return self._embed_documents(elements=[query])[0]

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element and attach the result to it.

        Elements without a "text" key are embedded as the empty string.
        """
        embeddings = self._embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed all texts with a single embeddings request.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: The embeddings in the order the response lists them.

        Raises:
            Exception: Whatever ``wrap_error`` returns for a failed request.
        """
        client = self.config.get_client()
        try:
            outputs = client.embeddings.create(
                model=self.config.embedder_model_name, input=elements
            )
        except Exception as e:
            raise self.wrap_error(e=e)
        # Walk the response items directly instead of indexing by input position.
        return [item.embedding for item in outputs.data]
81
+
82
+
83
@dataclass
class AsyncTogetherAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous embedding encoder backed by the Together AI embeddings API."""

    config: TogetherAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the configuration object."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its embedding."""
        embedding = await self._embed_documents(elements=[query])
        return embedding[0]

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the "text" of every element and attach the result to it.

        Elements without a "text" key are embedded as the empty string.
        """
        embeddings = await self._embed_documents([e.get("text", "") for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed all texts with a single embeddings request.

        Args:
            elements (list[str]): Texts to embed.

        Returns:
            list[list[float]]: The embeddings in the order the response lists them.

        Raises:
            Exception: Whatever ``wrap_error`` returns for a failed request.
        """
        client = self.config.get_async_client()
        try:
            outputs = await client.embeddings.create(
                model=self.config.embedder_model_name, input=elements
            )
        except Exception as e:
            raise self.wrap_error(e=e)
        # Walk the response items directly instead of indexing by input position.
        return [item.embedding for item in outputs.data]