unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,126 @@
1
+ # type: ignore
2
+ import json
3
+ import os
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Annotated, Any, Optional
7
+
8
+ from pydantic import Field, Secret, ValidationError
9
+ from pydantic.functional_validators import BeforeValidator
10
+
11
+ from unstructured_ingest.embed.interfaces import (
12
+ AsyncBaseEmbeddingEncoder,
13
+ BaseEmbeddingEncoder,
14
+ EmbeddingConfig,
15
+ )
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+ from unstructured_ingest.v2.errors import UserAuthError
18
+
19
+ if TYPE_CHECKING:
20
+ from vertexai.language_models import TextEmbeddingModel
21
+
22
+
23
def conform_string_to_dict(value: Any) -> dict:
    """Coerce a credentials value into a dict before pydantic validates it.

    Args:
        value: Either a dict (returned as-is) or a JSON string (decoded with
            ``json.loads``).

    Returns:
        The dict itself, or the result of decoding the JSON string.

    Raises:
        ValueError: If ``value`` is neither a dict nor a str. Malformed JSON
            raises ``json.JSONDecodeError``, which is itself a ``ValueError``.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return json.loads(value)
    # Validators must signal failure with ValueError; pydantic's ValidationError
    # cannot be constructed from a plain message and would raise a TypeError here.
    raise ValueError(f"Input could not be mapped to a valid dict: {value}")
29
+
30
+
31
+ ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
32
+
33
+
34
class VertexAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the Vertex AI embedder.

    Holds the credentials (``api_key``, a secret dict) and the name of the
    embedding model to load.
    """

    api_key: ApiKeyType
    embedder_model_name: Optional[str] = Field(
        default="textembedding-gecko@001", alias="model_name"
    )

    def wrap_error(self, e: Exception) -> Exception:
        """Return ``UserAuthError`` for Google auth failures, otherwise ``e`` unchanged."""
        from google.auth.exceptions import GoogleAuthError

        if isinstance(e, GoogleAuthError):
            return UserAuthError(e)
        return e

    def register_application_credentials(self):
        """Write the credentials to a file and export its path.

        The secret dict is dumped as JSON to a fixed path under ``/tmp`` and
        ``GOOGLE_APPLICATION_CREDENTIALS`` is set to that path. The file is
        restricted to owner read/write because it contains secrets.
        """
        # TODO look into passing credentials in directly, rather than via env var and tmp file
        application_credentials_path = Path("/tmp") / "google-vertex-app-credentials.json"
        fd = os.open(
            application_credentials_path,
            os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
            0o600,
        )
        with os.fdopen(fd, "w") as credentials_file:
            # The mode given to os.open only applies when the file is created;
            # tighten permissions on a pre-existing file before writing secrets.
            os.chmod(application_credentials_path, 0o600)
            json.dump(self.api_key.get_secret_value(), credentials_file)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)

    @requires_dependencies(
        ["vertexai"],
        extras="embed-vertexai",
    )
    def get_client(self) -> "TextEmbeddingModel":
        """Creates a VertexAI python client to embed elements."""
        from vertexai.language_models import TextEmbeddingModel

        self.register_application_credentials()
        return TextEmbeddingModel.from_pretrained(self.embedder_model_name)
64
+
65
+
66
@dataclass
class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous encoder that embeds text through the Vertex AI client."""

    config: VertexAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query):
        """Embed a single query and return its vector."""
        vectors = self._embed_documents(elements=[query])
        return vectors[0]

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``text`` of each element (empty string when absent) and attach the vectors."""
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    @requires_dependencies(
        ["vertexai"],
        extras="embed-vertexai",
    )
    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Request embeddings for the given texts; failures are passed through ``wrap_error``."""
        from vertexai.language_models import TextEmbeddingInput

        inputs = [TextEmbeddingInput(text=text) for text in elements]
        try:
            client = self.config.get_client()
            results = client.get_embeddings(inputs)
        except Exception as e:
            raise self.wrap_error(e=e)
        return [result.values for result in results]
95
+
96
+
97
@dataclass
class AsyncVertexAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous encoder that embeds text through the Vertex AI client."""

    config: VertexAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query):
        """Embed a single query and return its vector."""
        vectors = await self._embed_documents(elements=[query])
        return vectors[0]

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``text`` of each element (empty string when absent) and attach the vectors."""
        texts = [element.get("text", "") for element in elements]
        vectors = await self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    @requires_dependencies(
        ["vertexai"],
        extras="embed-vertexai",
    )
    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Await embeddings for the given texts; failures are passed through ``wrap_error``."""
        from vertexai.language_models import TextEmbeddingInput

        inputs = [TextEmbeddingInput(text=text) for text in elements]
        try:
            client = self.config.get_client()
            results = await client.get_embeddings_async(inputs)
        except Exception as e:
            raise self.wrap_error(e=e)
        return [result.values for result in results]
@@ -0,0 +1,130 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ from pydantic import Field, SecretStr
5
+
6
+ from unstructured_ingest.embed.interfaces import (
7
+ AsyncBaseEmbeddingEncoder,
8
+ BaseEmbeddingEncoder,
9
+ EmbeddingConfig,
10
+ )
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.errors import (
14
+ ProviderError,
15
+ UserAuthError,
16
+ UserError,
17
+ )
18
+ from unstructured_ingest.v2.errors import (
19
+ RateLimitError as CustomRateLimitError,
20
+ )
21
+
22
+ if TYPE_CHECKING:
23
+ from voyageai import AsyncClient as AsyncVoyageAIClient
24
+ from voyageai import Client as VoyageAIClient
25
+
26
+
27
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the VoyageAI embedder.

    ``api_key``, ``max_retries`` and ``timeout_in_seconds`` are passed to the
    VoyageAI clients built by ``get_client`` / ``get_async_client``.
    ``batch_size`` and ``truncation`` are declared here but not read by any
    method of this class.
    """

    api_key: SecretStr
    embedder_model_name: str = Field(default="voyage-3", alias="model_name")
    batch_size: Optional[int] = Field(default=None)
    truncation: Optional[bool] = Field(default=None)
    max_retries: int = 0
    timeout_in_seconds: Optional[int] = None

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a VoyageAI exception into one of the ingest error types.

        Args:
            e: The exception raised while calling VoyageAI.

        Returns:
            ``UserAuthError`` for authentication failures,
            ``CustomRateLimitError`` for rate limiting, ``UserError`` for other
            4xx statuses, ``ProviderError`` for 5xx statuses, and ``e`` itself
            (after logging) for anything else. Callers raise the returned value.
        """
        # https://docs.voyageai.com/docs/error-codes
        from voyageai.error import AuthenticationError, RateLimitError, VoyageError

        if not isinstance(e, VoyageError):
            logger.error(f"unhandled exception from voyageai: {e}", exc_info=True)
            # Return rather than raise so every path honours the same contract.
            return e
        http_code = e.http_status
        message = e.user_message
        if isinstance(e, AuthenticationError):
            return UserAuthError(message)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(message)
        # Guard against a missing status so the range checks cannot raise TypeError.
        if http_code is not None:
            if 400 <= http_code < 500:
                return UserError(message)
            if http_code >= 500:
                return ProviderError(message)
        logger.error(f"unhandled exception from voyageai: {e}", exc_info=True)
        return e

    @requires_dependencies(
        ["voyageai"],
        extras="embed-voyageai",
    )
    def get_client(self) -> "VoyageAIClient":
        """Creates a VoyageAI python client to embed elements."""
        from voyageai import Client as VoyageAIClient

        client = VoyageAIClient(
            api_key=self.api_key.get_secret_value(),
            max_retries=self.max_retries,
            timeout=self.timeout_in_seconds,
        )
        return client

    @requires_dependencies(
        ["voyageai"],
        extras="embed-voyageai",
    )
    def get_async_client(self) -> "AsyncVoyageAIClient":
        """Creates an async VoyageAI python client to embed elements."""
        from voyageai import AsyncClient as AsyncVoyageAIClient

        client = AsyncVoyageAIClient(
            api_key=self.api_key.get_secret_value(),
            max_retries=self.max_retries,
            timeout=self.timeout_in_seconds,
        )
        return client
84
+
85
+
86
@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous encoder that embeds text through the VoyageAI client."""

    config: VoyageAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Embed the given texts with the configured model; failures go through ``wrap_error``."""
        voyage_client = self.config.get_client()
        try:
            result = voyage_client.embed(
                texts=elements,
                model=self.config.embedder_model_name,
            )
        except Exception as e:
            raise self.wrap_error(e=e)
        return result.embeddings

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``text`` of each element (empty string when absent) and attach the vectors."""
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query and return its vector."""
        vectors = self._embed_documents(elements=[query])
        return vectors[0]
107
+
108
+
109
@dataclass
class AsyncVoyageAIEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous encoder that embeds text through the async VoyageAI client."""

    config: VoyageAIEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Await embeddings for the given texts; failures go through ``wrap_error``."""
        voyage_client = self.config.get_async_client()
        try:
            result = await voyage_client.embed(
                texts=elements,
                model=self.config.embedder_model_name,
            )
        except Exception as e:
            raise self.wrap_error(e=e)
        return result.embeddings

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed the ``text`` of each element (empty string when absent) and attach the vectors."""
        texts = [element.get("text", "") for element in elements]
        vectors = await self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    async def embed_query(self, query: str) -> list[float]:
        """Embed a single query and return its vector."""
        vectors = await self._embed_documents(elements=[query])
        return vectors[0]
@@ -0,0 +1,4 @@
1
+ from .dataclasses import enhanced_field
2
+ from .json_mixin import EnhancedDataClassJsonMixin
3
+
4
+ __all__ = ["enhanced_field", "EnhancedDataClassJsonMixin"]
@@ -0,0 +1,99 @@
1
+ import _thread
2
+ import copy
3
+ import functools
4
+ from dataclasses import fields
5
+
6
+ from dataclasses_json.core import (
7
+ Collection,
8
+ Enum,
9
+ Mapping,
10
+ _encode_overrides,
11
+ _handle_undefined_parameters_safe,
12
+ _user_overrides_or_exts,
13
+ is_dataclass,
14
+ )
15
+
16
+
17
def _recursive_repr(user_function):
    """Decorate a ``__repr__`` so a re-entrant call on the same object returns ``"..."``.

    Re-entrancy is tracked per (object id, thread id) pair.
    """
    # Copied from dataclasses as this method isn't exposed for importing
    active = set()

    @functools.wraps(user_function)
    def wrapper(self):
        marker = (id(self), _thread.get_ident())
        if marker in active:
            return "..."
        active.add(marker)
        try:
            return user_function(self)
        finally:
            # Always clear the marker, even when the wrapped repr raises.
            active.discard(marker)

    return wrapper
34
+
35
+
36
def _asdict(
    obj,
    encode_json=False,
    redact_sensitive=False,
    redacted_text="***REDACTED***",
    apply_name_overload: bool = True,
):
    """
    A re-implementation of `asdict` (based on the original in the `dataclasses`
    source) to support arbitrary Collection and Mapping types.

    Args:
        obj: The value to convert. Dataclasses become dicts, mappings become
            dicts, other collections (except str/bytes/Enum) become lists, and
            anything else is deep-copied.
        encode_json: Forwarded to ``_encode_overrides`` for dataclass values.
        redact_sensitive: When true, truthy values of fields whose ``sensitive``
            attribute is truthy are replaced with ``redacted_text``.
        redacted_text: Replacement used for redacted values.
        apply_name_overload: When true, a field's ``overload_name`` (if set) is
            used as its key instead of the field name.

    Returns:
        The converted structure.
    """

    def recurse(value):
        # Forward every option, so dataclasses nested inside mappings and
        # collections honour the caller's ``apply_name_overload`` as well.
        return _asdict(
            value,
            encode_json=encode_json,
            redact_sensitive=redact_sensitive,
            redacted_text=redacted_text,
            apply_name_overload=apply_name_overload,
        )

    if is_dataclass(obj):
        result = []
        overrides = _user_overrides_or_exts(obj)
        for field in fields(obj):
            if overrides[field.name].encoder:
                # A custom encoder is configured for this field: hand it the raw value.
                value = getattr(obj, field.name)
            else:
                value = recurse(getattr(obj, field.name))
            if getattr(field, "sensitive", False) and redact_sensitive and value:
                value = redacted_text
            overload_name = getattr(field, "overload_name", None)
            if overload_name and apply_name_overload:
                result.append((overload_name, value))
            else:
                result.append((field.name, value))

        result = _handle_undefined_parameters_safe(cls=obj, kvs=dict(result), usage="to")
        return _encode_overrides(
            dict(result), _user_overrides_or_exts(obj), encode_json=encode_json
        )
    elif isinstance(obj, Mapping):
        return {recurse(k): recurse(v) for k, v in obj.items()}
    elif isinstance(obj, Collection) and not isinstance(obj, (str, bytes, Enum)):
        return [recurse(v) for v in obj]
    else:
        return copy.deepcopy(obj)
@@ -0,0 +1,54 @@
1
+ import typing as t
2
+ from dataclasses import MISSING, Field
3
+
4
+ from unstructured_ingest.enhanced_dataclass.core import _recursive_repr
5
+
6
+
7
class EnhancedField(Field):
    """A dataclass ``Field`` carrying two extra attributes.

    Attributes set in ``__init__``:
        sensitive: Flag stored on the field (defaults to ``False``).
        overload_name: Optional alternative name stored on the field.
    """

    def __init__(self, *args, sensitive=False, overload_name: t.Optional[str] = None):
        # Positional args are passed straight through to ``Field.__init__``.
        super().__init__(*args)
        self.sensitive = sensitive
        self.overload_name = overload_name

    @_recursive_repr
    def __repr__(self):
        # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map
        fields_array = [
            f"name={self.name!r}",
            f"type={self.type!r}",
            f"default={self.default!r}",
            f"default_factory={self.default_factory!r}",
            f"init={self.init!r}",
            f"repr={self.repr!r}",
            f"hash={self.hash!r}",
            f"compare={self.compare!r}",
            f"metadata={self.metadata!r}",
            f"sensitive={self.sensitive!r}",
            f"overload_name={self.overload_name!r}",
            f"_field_type={self._field_type}",
        ]
        # Key on the attribute's presence, not its truthiness, so that
        # ``kw_only=False`` is still reported.
        if hasattr(self, "kw_only"):
            fields_array.append(f"kw_only={self.kw_only!r}")
        return "Field({})".format(",".join(fields_array))
33
+
34
+
35
def enhanced_field(
    *,
    default=MISSING,
    default_factory=MISSING,
    init: bool = True,
    repr: bool = True,
    hash=None,
    compare: bool = True,
    metadata=None,
    kw_only=MISSING,
    sensitive: bool = False,
    overload_name: t.Optional[str] = None,
):
    """Build an ``EnhancedField``, mirroring the keyword interface of ``dataclasses.field``.

    Args:
        default, default_factory, init, repr, hash, compare, metadata, kw_only:
            Passed positionally to ``EnhancedField`` (and on to ``Field``).
        sensitive: Stored on the resulting field.
        overload_name: Stored on the resulting field.

    Returns:
        The constructed ``EnhancedField``.

    Raises:
        ValueError: If both ``default`` and ``default_factory`` are given.
    """
    if default is not MISSING and default_factory is not MISSING:
        raise ValueError("cannot specify both default and default_factory")
    args = [default, default_factory, init, repr, hash, compare, metadata]
    # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map
    if "kw_only" in EnhancedField.__slots__:
        args.append(kw_only)
    # Interpreters whose Field also has a ``doc`` slot expect it as a further
    # positional argument; supply the "no documentation" value when present.
    if "doc" in EnhancedField.__slots__:
        args.append(None)
    return EnhancedField(*args, sensitive=sensitive, overload_name=overload_name)
@@ -0,0 +1,125 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import InitVar, fields
5
+ from typing import Any, Callable, Optional, Type, TypeVar, Union
6
+
7
+ import dataclasses_json.core as dataclasses_json_core
8
+ from dataclasses_json import DataClassJsonMixin
9
+
10
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
11
+
12
+ A = TypeVar("A", bound="EnhancedDataClassJsonMixin")
13
+
14
+ # Monkey-patch the dataclasses_json _decode_dataclass function to support name overrides
15
+ og_decode_dataclass = dataclasses_json_core._decode_dataclass
16
+
17
+
18
def custom_decode_dataclass(cls, kvs, infer_missing):
    """Rename overloaded keys back to field names, then run the original decoder.

    For each field of ``cls`` with a truthy ``overload_name``, a matching key
    in ``kvs`` is moved to the field's real name. The renaming is done on a
    copy so the caller's dict is never modified.

    Args:
        cls: The dataclass being decoded.
        kvs: The incoming data; only renamed when it is a dict.
        infer_missing: Forwarded unchanged to the original decoder.

    Returns:
        Whatever the original ``_decode_dataclass`` returns.
    """
    dataclass_fields = fields(cls)
    if isinstance(kvs, dict):
        copied = False
        for f in dataclass_fields:
            overload_name = getattr(f, "overload_name", None)
            if overload_name and overload_name in kvs:
                if not copied:
                    # Copy lazily: untouched inputs are passed through as-is.
                    kvs = kvs.copy()
                    copied = True
                kvs[f.name] = kvs.pop(overload_name)
    return og_decode_dataclass(cls, kvs, infer_missing)
30
+
31
+
32
+ dataclasses_json_core._decode_dataclass = custom_decode_dataclass
33
+
34
+
35
class EnhancedDataClassJsonMixin(DataClassJsonMixin):
    """DataClassJsonMixin variant with redaction and name-overload options.

    Methods:
        check_init_var: Reject classes that annotate fields with ``InitVar``.
        to_json: Serialize to a JSON string via ``to_dict``.
        from_dict: Build an instance from a dictionary.
        to_dict: Convert to a dictionary via ``_asdict``.
    """

    @classmethod
    def check_init_var(cls):
        """Raise ``TypeError`` if the class's own annotations contain an ``InitVar`` instance."""
        annotations = cls.__dict__.get("__annotations__", {})
        init_var_names = [
            name for name, annotation in annotations.items() if isinstance(annotation, InitVar)
        ]
        if not init_var_names:
            return
        raise TypeError(
            "Class {} has the following fields defined with an InitVar which "
            "cannot be used with EnhancedDataClassJsonMixin: {}".format(
                cls.__name__, ", ".join(init_var_names)
            )
        )

    def to_json(
        self,
        *,
        skipkeys: bool = False,
        ensure_ascii: bool = True,
        check_circular: bool = True,
        allow_nan: bool = True,
        indent: Optional[Union[int, str]] = None,
        separators: Optional[tuple[str, str]] = None,
        default: Optional[Callable[..., Any]] = None,
        sort_keys: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
        **kw: Any,
    ) -> str:
        """Serialize the object to a JSON string.

        The ``json.dumps`` options are forwarded as given; the redaction and
        name-overload options are forwarded to ``to_dict``.
        """
        self.check_init_var()
        payload = self.to_dict(
            encode_json=False,
            redact_sensitive=redact_sensitive,
            redacted_text=redacted_text,
            apply_name_overload=apply_name_overload,
        )
        return json.dumps(
            payload,
            cls=dataclasses_json_core._ExtendedEncoder,
            skipkeys=skipkeys,
            ensure_ascii=ensure_ascii,
            check_circular=check_circular,
            allow_nan=allow_nan,
            indent=indent,
            separators=separators,
            default=default,
            sort_keys=sort_keys,
            **kw,
        )

    @classmethod
    def from_dict(
        cls: Type[A],
        kvs: dataclasses_json_core.Json,
        *,
        infer_missing=False,
        apply_name_overload=False,
    ) -> A:
        """Decode ``kvs`` into an instance of ``cls``.

        ``apply_name_overload`` is accepted but not read by this method.
        """
        cls.check_init_var()
        # Looked up on the module at call time, so a patched decoder is used.
        decode = dataclasses_json_core._decode_dataclass
        return decode(cls, kvs, infer_missing)

    def to_dict(
        self,
        encode_json: bool = False,
        redact_sensitive: bool = False,
        redacted_text: str = "***REDACTED***",
        apply_name_overload: bool = True,
    ) -> dict[str, dataclasses_json_core.Json]:
        """Convert the object to a dictionary, forwarding all options to ``_asdict``."""
        self.check_init_var()
        options = {
            "encode_json": encode_json,
            "redact_sensitive": redact_sensitive,
            "redacted_text": redacted_text,
            "apply_name_overload": apply_name_overload,
        }
        return _asdict(self, **options)
@@ -0,0 +1,49 @@
1
+ from abc import ABC
2
+ from functools import wraps
3
+
4
+
5
class CustomError(Exception, ABC):
    """Base class for errors whose message is built from an ``error_string`` template."""

    # Template with one ``{}`` placeholder, filled with the text of the original error.
    error_string: str

    @classmethod
    def wrap(cls, f):
        """
        Provides a wrapper for a function that catches any exception and
        re-raises it as the custom error. If the exception itself is already an instance
        of the custom error, re-raises original error.

        Only ``Exception`` subclasses are converted: ``KeyboardInterrupt``,
        ``SystemExit`` and other ``BaseException``-only signals pass through
        untouched so that interrupting or exiting the process still works.
        """

        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except cls:
                # Already the right type (or a subclass): keep it as-is.
                raise
            except Exception as error:
                raise cls(cls.error_string.format(str(error))) from error

        return wrapper
26
+
27
+
28
class SourceConnectionError(CustomError):
    """Failure while getting data from the upstream data source."""

    error_string = "Error in getting data from upstream data source: {}"


class SourceConnectionNetworkError(SourceConnectionError):
    """Failure while connecting to the upstream data source."""

    error_string = "Error in connecting to upstream data source: {}"


class DestinationConnectionError(CustomError):
    """Failure while connecting to the downstream data source."""

    error_string = "Error in connecting to downstream data source: {}"


class EmbeddingEncoderConnectionError(CustomError):
    """Failure while connecting to the embedding model provider."""

    error_string = "Error in connecting to the embedding model provider: {}"


class WriteError(CustomError):
    """Failure while writing to the downstream data source."""

    error_string = "Error in writing to downstream data source: {}"


class PartitionError(CustomError):
    """Failure while partitioning content."""

    error_string = "Error in partitioning content: {}"
@@ -0,0 +1,3 @@
1
+ from ._wrapper import RetryHandler
2
+
3
+ __all__ = ["RetryHandler"]
@@ -0,0 +1,102 @@
1
+ import logging
2
+ import sys
3
+ import traceback
4
+
5
+
6
+ # Default startup handler
7
def _log_start(details, logger, log_level):
    """Log that a retried call is about to be attempted.

    Args:
        details: Dict describing the call. Reads ``target`` (required) and the
            optional ``max_tries``, ``max_time``, ``exception``, ``args`` and
            ``kwargs`` keys.
        logger: Object whose ``log(level, msg, *args)`` method is called.
        log_level: Level to log at. At ``logging.INFO`` or above the call
            arguments are elided as ``...``; below that they are spelled out.
    """
    max_tried = details.get("max_tries")
    max_time = details.get("max_time")
    if max_tried is not None and max_time is not None:
        s = "%.1fs or %d tries"
        s_args = [max_time, max_tried]
    elif max_tried is not None:
        s = "%d tries"
        s_args = [max_tried]
    elif max_time is not None:
        s = "%.1fs"
        s_args = [max_time]
    else:
        # Neither limit is set: avoid feeding None to a numeric placeholder.
        s = "an unlimited duration"
        s_args = []
    exception = details.get("exception")
    if isinstance(exception, tuple):
        exception = list(exception)
    elif not isinstance(exception, list):
        exception = [exception]
    # Fall back to str() for entries without a __name__ (e.g. a missing value).
    exception_s = ", ".join([getattr(e, "__name__", str(e)) for e in exception])
    if log_level >= logging.INFO:
        msg = f"Attempting %s(...), will retry for {s} given these issues: %s"
        log_args = [details["target"].__name__] + s_args + [exception_s]
    else:
        msg = f"Attempting %s(%s), will retry for {s} given these issues: %s"
        target_input_list = []
        if args := details.get("args"):
            target_input_list.extend([str(d) for d in args])
        if kwargs := details.get("kwargs"):
            target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
        target_input = ", ".join(target_input_list) if target_input_list else ""
        log_args = (
            [
                details["target"].__name__,
                target_input,
            ]
            + s_args
            + [exception_s]
        )
    logger.log(log_level, msg, *log_args)
45
+
46
+
47
+ # Default backoff handler
48
def _log_backoff(details, logger, log_level):
    """Log that a call failed and will be retried after a wait.

    Args:
        details: Dict describing the call. Reads ``target``, ``wait`` and, in
            the verbose form, ``tries`` plus the optional ``args``/``kwargs``.
            ``value`` is read when no exception is currently being handled.
        logger: Object whose ``log(level, msg, *args)`` method is called.
        log_level: Level to log at. At ``logging.INFO`` or above the call
            arguments are elided as ``...``; below that they are spelled out.
    """
    if log_level >= logging.INFO:
        msg = "Backing off %s(...) for %.1fs (%s)"
        # The seconds placeholder reports the wait time, not the try count.
        log_args = [details["target"].__name__, details["wait"]]
    else:
        msg = "Backing off %.1fs seconds after %d tries calling function %s(%s) -> %s"
        target_input_list = []
        if args := details.get("args"):
            target_input_list.extend([str(d) for d in args])
        if kwargs := details.get("kwargs"):
            target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
        target_input = ", ".join(target_input_list) if target_input_list else ""
        log_args = [
            details["wait"],
            details["tries"],
            details["target"].__name__,
            target_input,
        ]
    # Report the exception being handled if there is one, else the returned value.
    exc_typ, exc, _ = sys.exc_info()
    if exc is not None:
        exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
        log_args.append(exc_fmt.rstrip("\n"))
    else:
        log_args.append(str(details["value"]))
    logger.log(log_level, msg, *log_args)
73
+
74
+
75
+ # Default giveup handler
76
def _log_giveup(details, logger, log_level):
    """Log that retrying has been abandoned.

    Args:
        details: Dict describing the call. Reads ``target`` and ``tries``; the
            verbose form also reads ``elapsed`` (falling back to ``wait``, then
            0.0) and the optional ``args``/``kwargs``. ``value`` is read when
            no exception is currently being handled.
        logger: Object whose ``log(level, msg, *args)`` method is called.
        log_level: Level to log at. At ``logging.INFO`` or above the call
            arguments are elided as ``...``; below that they are spelled out.
    """
    if log_level >= logging.INFO:
        # The value logged here is the attempt count, so format it as one.
        msg = "Giving up %s(...) after %d tries (%s)"
        log_args = [details["target"].__name__, details["tries"]]
    else:
        msg = "Giving up after %d tries (%.1fs) calling function %s(%s) -> %s"
        target_input_list = []
        if args := details.get("args"):
            target_input_list.extend([str(d) for d in args])
        if kwargs := details.get("kwargs"):
            target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()])
        target_input = ", ".join(target_input_list) if target_input_list else "..."
        # Give-up details carry the total elapsed time; "wait" is not supplied
        # to give-up handlers, so indexing it directly would raise KeyError.
        duration = details.get("elapsed", details.get("wait", 0.0))
        log_args = [
            details["tries"],
            duration,
            details["target"].__name__,
            target_input,
        ]

    # Report the exception being handled if there is one, else the returned value.
    exc_typ, exc, _ = sys.exc_info()
    if exc is not None:
        exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1]
        log_args.append(exc_fmt.rstrip("\n"))
    else:
        log_args.append(details["value"])

    logger.log(log_level, msg, *log_args)