unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,59 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
7
+ from test.integration.utils import requires_env
8
+ from unstructured_ingest.embed.azure_openai import (
9
+ AzureOpenAIEmbeddingConfig,
10
+ AzureOpenAIEmbeddingEncoder,
11
+ )
12
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
13
+
14
+ API_KEY = "AZURE_OPENAI_API_KEY"
15
+ ENDPOINT = "AZURE_OPENAI_ENDPOINT"
16
+
17
+
18
@dataclass(frozen=True)
class AzureData:
    """Immutable holder for the two Azure OpenAI settings these tests read from the environment."""

    # Value handed to the embedder/encoder configs as the API key.
    api_key: str
    # Value handed to the embedder/encoder configs as the Azure endpoint.
    endpoint: str
22
+
23
+
24
def get_azure_data() -> AzureData:
    """Build an ``AzureData`` from the environment variables named by ``API_KEY`` and ``ENDPOINT``.

    Asserts that each variable is set to a non-empty value, checking the API key first.
    """
    key_value = os.environ.get(API_KEY)
    assert key_value
    endpoint_value = os.environ.get(ENDPOINT)
    assert endpoint_value
    return AzureData(key_value, endpoint_value)
30
+
31
+
32
@requires_env(API_KEY, ENDPOINT)
def test_azure_openai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "azure-openai" provider and validate it."""
    creds = get_azure_data()
    runner = Embedder(
        config=EmbedderConfig(
            embedding_provider="azure-openai",
            embedding_api_key=creds.api_key,
            embedding_azure_endpoint=creds.endpoint,
        )
    )
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
46
+
47
+
48
@requires_env(API_KEY, ENDPOINT)
def test_raw_azure_openai_embedder(embedder_file: Path):
    """Exercise ``AzureOpenAIEmbeddingEncoder`` directly, expecting dimensions ``(1536,)``."""
    creds = get_azure_data()
    encoder_config = AzureOpenAIEmbeddingConfig(
        api_key=creds.api_key,
        azure_endpoint=creds.endpoint,
    )
    encoder = AzureOpenAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
    )
@@ -0,0 +1,103 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.bedrock import (
14
+ AsyncBedrockEmbeddingEncoder,
15
+ BedrockEmbeddingConfig,
16
+ BedrockEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.errors import UserAuthError, UserError
19
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
20
+
21
+
22
def get_aws_credentials() -> dict:
    """Collect the AWS key pair from the environment as config-style keyword arguments.

    Reads ``AWS_ACCESS_KEY_ID`` and then ``AWS_SECRET_ACCESS_KEY``; asserts that each is set to
    a non-empty value. The returned dict uses the lower-case keys ``aws_access_key_id`` and
    ``aws_secret_access_key``.
    """
    env_to_kwarg = {
        "AWS_ACCESS_KEY_ID": "aws_access_key_id",
        "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
    }
    credentials = {}
    for env_name, kwarg_name in env_to_kwarg.items():
        value = os.environ.get(env_name)
        assert value
        credentials[kwarg_name] = value
    return credentials
28
+
29
+
30
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_bedrock_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "aws-bedrock" provider and validate it."""
    creds = get_aws_credentials()
    runner = Embedder(
        config=EmbedderConfig(
            embedding_provider="aws-bedrock",
            embedding_aws_access_key_id=creds["aws_access_key_id"],
            embedding_aws_secret_access_key=creds["aws_secret_access_key"],
        )
    )
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
44
+
45
+
46
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_raw_bedrock_embedder(embedder_file: Path):
    """Exercise ``BedrockEmbeddingEncoder`` directly: dimensions ``(1536,)``, not a unit vector."""
    creds = get_aws_credentials()
    encoder_config = BedrockEmbeddingConfig(
        aws_access_key_id=creds["aws_access_key_id"],
        aws_secret_access_key=creds["aws_secret_access_key"],
    )
    encoder = BedrockEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
        expected_is_unit_vector=False,
    )
61
+
62
+
63
def test_raw_bedrock_embedder_invalid_credentials(embedder_file: Path):
    """Placeholder AWS keys must make ``get_exemplary_embedding`` raise ``UserAuthError``.

    ``embedder_file`` is requested but not used by the body.
    """
    bogus_config = BedrockEmbeddingConfig(
        aws_access_key_id="no_key",
        aws_secret_access_key="no_secret",
    )
    encoder = BedrockEmbeddingEncoder(config=bogus_config)
    with pytest.raises(UserAuthError):
        encoder.get_exemplary_embedding()
72
+
73
+
74
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
def test_raw_bedrock_embedder_invalid_model(embedder_file: Path):
    """Real credentials with model name "invalid_model" must raise ``UserError``.

    ``embedder_file`` is requested but not used by the body.
    """
    creds = get_aws_credentials()
    encoder_config = BedrockEmbeddingConfig(
        aws_access_key_id=creds["aws_access_key_id"],
        aws_secret_access_key=creds["aws_secret_access_key"],
        model_name="invalid_model",
    )
    encoder = BedrockEmbeddingEncoder(config=encoder_config)
    with pytest.raises(UserError):
        encoder.get_exemplary_embedding()
86
+
87
+
88
@requires_env("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
@pytest.mark.asyncio
async def test_raw_async_bedrock_embedder(embedder_file: Path):
    """Exercise ``AsyncBedrockEmbeddingEncoder``: dimensions ``(1536,)``, not a unit vector."""
    creds = get_aws_credentials()
    encoder_config = BedrockEmbeddingConfig(
        aws_access_key_id=creds["aws_access_key_id"],
        aws_secret_access_key=creds["aws_secret_access_key"],
    )
    encoder = AsyncBedrockEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,26 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from test.integration.embedders.utils import validate_embedding_output, validate_raw_embedder
5
+ from unstructured_ingest.embed.huggingface import (
6
+ HuggingFaceEmbeddingConfig,
7
+ HuggingFaceEmbeddingEncoder,
8
+ )
9
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
10
+
11
+
12
def test_huggingface_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "huggingface" provider and validate it."""
    runner = Embedder(config=EmbedderConfig(embedding_provider="huggingface"))
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
20
+
21
+
22
# NOTE(review): "hugginface" in the name looks like a typo for "huggingface"; kept as-is so the
# test id stays stable.
def test_raw_hugginface_embedder(embedder_file: Path):
    """Exercise ``HuggingFaceEmbeddingEncoder`` with a default config, expecting ``(384,)``."""
    default_config = HuggingFaceEmbeddingConfig()
    encoder = HuggingFaceEmbeddingEncoder(config=default_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(384,),
    )
@@ -0,0 +1,71 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.mixedbreadai import (
14
+ AsyncMixedbreadAIEmbeddingEncoder,
15
+ MixedbreadAIEmbeddingConfig,
16
+ MixedbreadAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
19
+
20
+ API_KEY = "MXBAI_API_KEY"
21
+
22
+
23
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
27
+
28
+
29
@requires_env(API_KEY)
def test_mixedbread_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "mixedbread-ai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(
        config=EmbedderConfig(embedding_provider="mixedbread-ai", embedding_api_key=key)
    )
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
39
+
40
+
41
@requires_env(API_KEY)
def test_raw_mixedbread_embedder(embedder_file: Path):
    """Exercise ``MixedbreadAIEmbeddingEncoder``: dimensions ``(1024,)``, not a unit vector."""
    encoder_config = MixedbreadAIEmbeddingConfig(api_key=get_api_key())
    encoder = MixedbreadAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
        expected_is_unit_vector=False,
    )
55
+
56
+
57
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_mixedbread_embedder(embedder_file: Path):
    """Exercise ``AsyncMixedbreadAIEmbeddingEncoder``: dimensions ``(1024,)``, not a unit vector."""
    encoder_config = MixedbreadAIEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncMixedbreadAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,77 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.octoai import (
14
+ AsyncOctoAIEmbeddingEncoder,
15
+ OctoAiEmbeddingConfig,
16
+ OctoAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.errors import UserAuthError
19
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
20
+
21
+ API_KEY = "OCTOAI_API_KEY"
22
+
23
+
24
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
28
+
29
+
30
@requires_env(API_KEY)
def test_octoai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "octoai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(config=EmbedderConfig(embedding_provider="octoai", embedding_api_key=key))
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
40
+
41
+
42
@requires_env(API_KEY)
def test_raw_octoai_embedder(embedder_file: Path):
    """Exercise ``OctoAIEmbeddingEncoder`` directly, expecting dimensions ``(1024,)``."""
    encoder_config = OctoAiEmbeddingConfig(api_key=get_api_key())
    encoder = OctoAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
    )
53
+
54
+
55
@pytest.mark.skip(reason="Unexpected connection error at the moment")
def test_raw_octoai_embedder_invalid_credentials():
    """A fake API key should make ``get_exemplary_embedding`` raise ``UserAuthError``.

    Currently skipped; see the skip reason on the decorator.
    """
    bogus_config = OctoAiEmbeddingConfig(api_key="fake_api_key")
    encoder = OctoAIEmbeddingEncoder(config=bogus_config)
    with pytest.raises(UserAuthError):
        encoder.get_exemplary_embedding()
64
+
65
+
66
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_octoai_embedder(embedder_file: Path):
    """Exercise ``AsyncOctoAIEmbeddingEncoder`` directly, expecting dimensions ``(1024,)``."""
    encoder_config = OctoAiEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncOctoAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
    )
@@ -0,0 +1,76 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.openai import (
14
+ AsyncOpenAIEmbeddingEncoder,
15
+ OpenAIEmbeddingConfig,
16
+ OpenAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.errors import UserAuthError
19
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
20
+
21
+ API_KEY = "OPENAI_API_KEY"
22
+
23
+
24
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
28
+
29
+
30
@requires_env(API_KEY)
def test_openai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "openai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(config=EmbedderConfig(embedding_provider="openai", embedding_api_key=key))
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
40
+
41
+
42
@requires_env(API_KEY)
def test_raw_openai_embedder(embedder_file: Path):
    """Exercise ``OpenAIEmbeddingEncoder`` directly, expecting dimensions ``(1536,)``."""
    encoder_config = OpenAIEmbeddingConfig(api_key=get_api_key())
    encoder = OpenAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
    )
53
+
54
+
55
def test_raw_openai_embedder_invalid_credentials():
    """A fake API key must make ``get_exemplary_embedding`` raise ``UserAuthError``."""
    bogus_config = OpenAIEmbeddingConfig(api_key="fake_api_key")
    encoder = OpenAIEmbeddingEncoder(config=bogus_config)
    with pytest.raises(UserAuthError):
        encoder.get_exemplary_embedding()
63
+
64
+
65
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_openai_embedder(embedder_file: Path):
    """Exercise ``AsyncOpenAIEmbeddingEncoder`` directly, expecting dimensions ``(1536,)``."""
    encoder_config = OpenAIEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncOpenAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1536,),
    )
@@ -0,0 +1,71 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.togetherai import (
14
+ AsyncTogetherAIEmbeddingEncoder,
15
+ TogetherAIEmbeddingConfig,
16
+ TogetherAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.errors import UserAuthError
19
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
20
+
21
+ API_KEY = "TOGETHERAI_API_KEY"
22
+
23
+
24
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
28
+
29
+
30
@requires_env(API_KEY)
def test_togetherai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "togetherai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(
        config=EmbedderConfig(embedding_provider="togetherai", embedding_api_key=key)
    )
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
40
+
41
+
42
@requires_env(API_KEY)
def test_raw_togetherai_embedder(embedder_file: Path):
    """Exercise ``TogetherAIEmbeddingEncoder``: dimensions ``(768,)``, not a unit vector."""
    encoder_config = TogetherAIEmbeddingConfig(api_key=get_api_key())
    encoder = TogetherAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
        expected_is_unit_vector=False,
    )
52
+
53
+
54
def test_raw_togetherai_embedder_invalid_credentials():
    """A fake API key must make ``get_exemplary_embedding`` raise ``UserAuthError``."""
    bogus_config = TogetherAIEmbeddingConfig(api_key="fake_api_key")
    encoder = TogetherAIEmbeddingEncoder(config=bogus_config)
    with pytest.raises(UserAuthError):
        encoder.get_exemplary_embedding()
59
+
60
+
61
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_togetherai_embedder(embedder_file: Path):
    """Exercise ``AsyncTogetherAIEmbeddingEncoder``: dimensions ``(768,)``, not a unit vector."""
    encoder_config = TogetherAIEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncTogetherAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
        expected_is_unit_vector=False,
    )
@@ -0,0 +1,65 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.vertexai import (
14
+ AsyncVertexAIEmbeddingEncoder,
15
+ VertexAIEmbeddingConfig,
16
+ VertexAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
19
+
20
+ API_KEY = "VERTEXAI_API_KEY"
21
+
22
+
23
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
27
+
28
+
29
@requires_env(API_KEY)
def test_vertexai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "vertexai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(config=EmbedderConfig(embedding_provider="vertexai", embedding_api_key=key))
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
39
+
40
+
41
@requires_env(API_KEY)
def test_raw_vertexai_embedder(embedder_file: Path):
    """Exercise ``VertexAIEmbeddingEncoder`` directly, expecting dimensions ``(768,)``."""
    encoder_config = VertexAIEmbeddingConfig(api_key=get_api_key())
    encoder = VertexAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
    )
52
+
53
+
54
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_vertexai_embedder(embedder_file: Path):
    """Exercise ``AsyncVertexAIEmbeddingEncoder`` directly, expecting dimensions ``(768,)``."""
    encoder_config = VertexAIEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncVertexAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(768,),
    )
@@ -0,0 +1,65 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.embedders.utils import (
8
+ validate_embedding_output,
9
+ validate_raw_embedder,
10
+ validate_raw_embedder_async,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.embed.voyageai import (
14
+ AsyncVoyageAIEmbeddingEncoder,
15
+ VoyageAIEmbeddingConfig,
16
+ VoyageAIEmbeddingEncoder,
17
+ )
18
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
19
+
20
+ API_KEY = "VOYAGEAI_API_KEY"
21
+
22
+
23
def get_api_key() -> str:
    """Return the value of the environment variable named by the module-level ``API_KEY``.

    Asserts that the variable is set to a non-empty value.
    """
    value = os.environ.get(API_KEY)
    assert value
    return value
27
+
28
+
29
@requires_env(API_KEY)
def test_voyageai_embedder(embedder_file: Path):
    """Embed the fixture file via ``Embedder`` with the "voyageai" provider and validate it."""
    key = get_api_key()
    runner = Embedder(config=EmbedderConfig(embedding_provider="voyageai", embedding_api_key=key))
    embedded = runner.run(elements_filepath=embedder_file)
    assert embedded
    # Reload the input file so the embedded output can be compared against it.
    source_elements = json.loads(embedder_file.read_text())
    validate_embedding_output(original_elements=source_elements, output_elements=embedded)
39
+
40
+
41
@requires_env(API_KEY)
def test_raw_voyageai_embedder(embedder_file: Path):
    """Exercise ``VoyageAIEmbeddingEncoder`` directly, expecting dimensions ``(1024,)``."""
    encoder_config = VoyageAIEmbeddingConfig(api_key=get_api_key())
    encoder = VoyageAIEmbeddingEncoder(config=encoder_config)
    validate_raw_embedder(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
    )
52
+
53
+
54
@requires_env(API_KEY)
@pytest.mark.asyncio
async def test_raw_async_voyageai_embedder(embedder_file: Path):
    """Exercise ``AsyncVoyageAIEmbeddingEncoder`` directly, expecting dimensions ``(1024,)``."""
    encoder_config = VoyageAIEmbeddingConfig(api_key=get_api_key())
    encoder = AsyncVoyageAIEmbeddingEncoder(config=encoder_config)
    await validate_raw_embedder_async(
        embedder=encoder,
        embedder_file=embedder_file,
        expected_dimensions=(1024,),
    )
@@ -0,0 +1,68 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from unstructured_ingest.embed.interfaces import AsyncBaseEmbeddingEncoder, BaseEmbeddingEncoder
6
+
7
+
8
def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
    """
    Make sure the following characteristics are met:
    * The same number of elements are returned
    * For each element that had text, an embeddings entry was added in the output
    * Other than the embedding, nothing about the element was changed

    Side effect: the "embeddings" key is removed from every dict in ``output_elements``
    while checking. This is kept deliberately, because callers may pass output dicts that
    are the very same objects as the originals, in which case the pop is what makes the
    final equality hold.

    Raises:
        AssertionError: if any of the characteristics above is violated; the message
            names the offending element index.
    """
    assert len(original_elements) == len(
        output_elements
    ), f"expected {len(original_elements)} output elements, got {len(output_elements)}"
    for index, (original_element, output_element) in enumerate(
        zip(original_elements, output_elements)
    ):
        if original_element.get("text"):
            assert output_element.get(
                "embeddings", None
            ), f"element {index} has text but no embeddings in the output"
        # Drop the embedding so only the pre-existing fields take part in the comparison.
        output_element.pop("embeddings", None)
        assert (
            original_element == output_element
        ), f"element {index} differs from the original beyond the embeddings entry"
21
+
22
+
23
def validate_raw_embedder(
    embedder: BaseEmbeddingEncoder,
    embedder_file: Path,
    expected_dimensions: Optional[tuple[int, ...]] = None,
    expected_is_unit_vector: bool = True,
):
    """
    Exercise a synchronous embedding encoder against a JSON file of elements.

    Checks, in order:
    * ``embedder.num_of_dimensions`` equals ``expected_dimensions`` (only when given)
    * ``embedder.is_unit_vector`` equals ``expected_is_unit_vector``
    * ``embed_query`` on the first non-empty element text returns a vector whose length is
      the first entry of ``num_of_dimensions``
    * the result of ``embed_documents`` passes ``validate_embedding_output``

    Args:
        embedder: encoder under test.
        embedder_file: path to a JSON file holding a list of element dicts.
        expected_dimensions: expected value of ``embedder.num_of_dimensions``; skipped if falsy.
        expected_is_unit_vector: expected value of ``embedder.is_unit_vector``.

    Raises:
        AssertionError: if any check fails or the file holds no element with text.
    """
    import copy

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(embedder_file, encoding="utf-8") as f:
        elements = json.load(f)
    # Tolerate elements without text, as validate_embedding_output does.
    texts = [element["text"] for element in elements if element.get("text")]
    assert texts, f"no element with text found in {embedder_file}"
    single_text = texts[0]
    num_of_dimensions = embedder.num_of_dimensions
    if expected_dimensions:
        assert (
            num_of_dimensions == expected_dimensions
        ), f"number of dimensions {num_of_dimensions} didn't match expected: {expected_dimensions}"
    is_unit_vector = embedder.is_unit_vector
    assert (
        is_unit_vector == expected_is_unit_vector
    ), f"is_unit_vector {is_unit_vector} didn't match expected: {expected_is_unit_vector}"
    single_embedding = embedder.embed_query(query=single_text)
    expected_length = num_of_dimensions[0]
    assert (
        len(single_embedding) == expected_length
    ), f"query embedding length {len(single_embedding)} didn't match expected: {expected_length}"
    # Give the encoder its own deep copy: if embed_documents annotates the dicts it receives
    # in place, comparing against the same objects would make the check below vacuous.
    embedded_elements = embedder.embed_documents(elements=copy.deepcopy(elements))
    validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
45
+
46
+
47
async def validate_raw_embedder_async(
    embedder: AsyncBaseEmbeddingEncoder,
    embedder_file: Path,
    expected_dimensions: Optional[tuple[int, ...]] = None,
    expected_is_unit_vector: bool = True,
):
    """
    Exercise an asynchronous embedding encoder against a JSON file of elements.

    Checks, in order:
    * awaited ``embedder.num_of_dimensions`` equals ``expected_dimensions`` (only when given)
    * awaited ``embedder.is_unit_vector`` equals ``expected_is_unit_vector``
    * awaited ``embed_query`` on the first non-empty element text returns a vector whose
      length is the first entry of ``num_of_dimensions``
    * the awaited result of ``embed_documents`` passes ``validate_embedding_output``

    Args:
        embedder: async encoder under test.
        embedder_file: path to a JSON file holding a list of element dicts.
        expected_dimensions: expected value of ``num_of_dimensions``; skipped if falsy.
        expected_is_unit_vector: expected value of ``is_unit_vector``.

    Raises:
        AssertionError: if any check fails or the file holds no element with text.
    """
    import copy

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(embedder_file, encoding="utf-8") as f:
        elements = json.load(f)
    # Tolerate elements without text, as validate_embedding_output does.
    texts = [element["text"] for element in elements if element.get("text")]
    assert texts, f"no element with text found in {embedder_file}"
    single_text = texts[0]
    num_of_dimensions = await embedder.num_of_dimensions
    if expected_dimensions:
        assert (
            num_of_dimensions == expected_dimensions
        ), f"number of dimensions {num_of_dimensions} didn't match expected: {expected_dimensions}"
    is_unit_vector = await embedder.is_unit_vector
    assert (
        is_unit_vector == expected_is_unit_vector
    ), f"is_unit_vector {is_unit_vector} didn't match expected: {expected_is_unit_vector}"
    single_embedding = await embedder.embed_query(query=single_text)
    expected_length = num_of_dimensions[0]
    assert (
        len(single_embedding) == expected_length
    ), f"query embedding length {len(single_embedding)} didn't match expected: {expected_length}"
    # Give the encoder its own deep copy: if embed_documents annotates the dicts it receives
    # in place, comparing against the same objects would make the check below vacuous.
    embedded_elements = await embedder.embed_documents(elements=copy.deepcopy(elements))
    validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
File without changes