unstructured-ingest 0.3.13__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,350 @@
1
+ import asyncio
2
+ import json
3
+ import uuid
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Mapping, Optional
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.error import DestinationConnectionError
12
+ from unstructured_ingest.utils.data_prep import flatten_dict
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ FileData,
18
+ Uploader,
19
+ UploaderConfig,
20
+ UploadStager,
21
+ UploadStagerConfig,
22
+ )
23
+ from unstructured_ingest.v2.logger import logger
24
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
25
+
26
+ BASE_URL = "https://api.vectara.io/v2"
27
+
28
+ CONNECTOR_TYPE = "vectara"
29
+
30
+
31
# Sensitive OAuth2 client credentials used to obtain a Vectara JWT.
class VectaraAccessConfig(AccessConfig):
    # OAuth2 client identifier, sent as `client_id` in the token request.
    oauth_client_id: str = Field(
        description="Client ID",
    )
    # OAuth2 client secret, sent as `client_secret` in the token request.
    oauth_secret: str = Field(
        description="Client Secret",
    )
34
+
35
+
36
# Connection settings for the Vectara destination.
class VectaraConnectionConfig(ConnectionConfig):
    # Credentials are wrapped in `Secret`; read them with `get_secret_value()`.
    access_config: Secret[VectaraAccessConfig]

    # Substituted into the `{}` placeholder of `token_url`.
    customer_id: str

    # Either value may be omitted. When only the name is given, the uploader's
    # connection check looks up the matching key and stores it in `corpus_key`.
    corpus_name: Optional[str] = None
    corpus_key: Optional[str] = None

    # OAuth2 token endpoint template; `{}` is filled with `customer_id`.
    token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token"
42
+
43
+
44
# Placeholder: the Vectara stager currently has no options of its own.
class VectaraUploadStagerConfig(UploadStagerConfig):
    pass
46
+
47
+
48
@dataclass
class VectaraUploadStager(UploadStager):
    """Convert a JSON file of elements into the single-document payload sent to Vectara."""

    upload_stager_config: VectaraUploadStagerConfig = field(
        default_factory=VectaraUploadStagerConfig
    )

    @staticmethod
    def conform_dict(data: dict) -> dict:
        """
        Build the metadata dictionary Vectara expects for one document part.
        See more detail in https://docs.vectara.com/docs/rest-api/create-corpus-document

        The element is flattened with "-" as the key separator, every "metadata-"
        occurrence is stripped from the flattened keys, and only the keys listed
        in the mapping below are kept (renamed to the mapped value).
        """
        metadata_map = {
            "page_number": "page_number",
            "data_source-url": "url",
            "filename": "filename",
            "filetype": "filetype",
            "last_modified": "last_modified",
            "element_id": "element_id",
        }
        flattened = flatten_dict(data, separator="-", flatten_lists=True)

        # Pass 1: drop the "metadata-" marker from every flattened key.
        stripped = {}
        for raw_key, value in flattened.items():
            stripped[raw_key.replace("metadata-", "")] = value

        # Pass 2: keep only the whitelisted keys, under their Vectara names.
        conformed = {}
        for key, value in stripped.items():
            if key in metadata_map:
                conformed[metadata_map[key]] = value
        return conformed

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """
        Read the elements in `input_file` and write one Vectara "core" document to `output_file`.

        Every element becomes one entry of `document_parts`. Its "text" is removed
        from the element (the loaded element dict is mutated) before the rest is
        turned into metadata by `conform_dict`.
        """
        with input_file.open() as in_f:
            elements_contents = json.load(in_f)

        logger.info(
            f"Extending {len(elements_contents)} json elements from content in {input_file}"
        )

        document_id = str(uuid.uuid4())
        document_title = file_data.identifier

        document_parts = []
        for element in elements_contents:
            # Pop the text first so it does not reach the metadata conversion.
            part_text = element.pop("text", None)
            part_metadata = self.conform_dict(data=element)
            document_parts.append({"text": part_text, "metadata": part_metadata})

        document = {
            "id": document_id,
            "type": "core",
            "metadata": {
                "title": document_title,
            },
            "document_parts": document_parts,
        }

        # The uploader receives a list, so the single document is wrapped in one.
        with open(output_file, "w") as out_f:
            json.dump([document], out_f, indent=2)
103
+
104
+
105
# Placeholder: the Vectara uploader currently has no options of its own.
class VectaraUploaderConfig(UploaderConfig):
    pass
107
+
108
+
109
@dataclass
class VectaraUploader(Uploader):
    """
    Index staged documents into a Vectara corpus through the v2 REST API.

    Authentication uses the OAuth2 client-credentials grant against
    `connection_config.token_url`. The resulting JWT is cached on the instance
    and fetched again once it is within 60 seconds of expiring.
    """

    connector_type: str = CONNECTOR_TYPE
    upload_config: VectaraUploaderConfig
    connection_config: VectaraConnectionConfig
    # Cached bearer token and the POSIX timestamp at which it expires.
    _jwt_token: Optional[str] = field(init=False, default=None)
    _jwt_token_expires_ts: Optional[float] = field(init=False, default=None)

    def is_async(self) -> bool:
        """Report that uploads run through `run_data_async`."""
        return True

    def precheck(self) -> None:
        """
        Validate credentials and the configured corpus before any upload.

        Raises:
            DestinationConnectionError: if the token cannot be obtained, the
                corpora cannot be listed, or the corpus name/key do not resolve.
        """
        try:
            self._check_connection_and_corpora()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def _token_needs_refresh(self) -> bool:
        """Return True when no token is cached or it expires within 60 seconds."""
        if not self._jwt_token or self._jwt_token_expires_ts is None:
            return True
        return self._jwt_token_expires_ts - datetime.now().timestamp() <= 60

    @property
    async def jwt_token_async(self) -> str:
        """Awaitable bearer token, refreshed on demand (use as `await self.jwt_token_async`)."""
        if self._token_needs_refresh():
            self._jwt_token = await self._get_jwt_token_async()
        return self._jwt_token

    @property
    def jwt_token(self) -> str:
        """Bearer token, refreshed on demand with a blocking request."""
        if self._token_needs_refresh():
            self._jwt_token = self._get_jwt_token()
        return self._jwt_token

    def _token_request_args(self) -> tuple[str, dict, dict]:
        """Return `(endpoint, headers, form_data)` for the client-credentials token request."""
        access_config = self.connection_config.access_config.get_secret_value()
        token_endpoint = self.connection_config.token_url.format(self.connection_config.customer_id)
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "grant_type": "client_credentials",
            "client_id": access_config.oauth_client_id,
            "client_secret": access_config.oauth_secret,
        }
        return token_endpoint, headers, data

    def _record_token_response(self, response_json: dict) -> str:
        """Store the expiry timestamp from a token response and return its access token."""
        request_time = datetime.now().timestamp()
        self._jwt_token_expires_ts = request_time + response_json.get("expires_in")
        return response_json.get("access_token")

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    async def _get_jwt_token_async(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        async with httpx.AsyncClient() as client:
            response = await client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    # Get Oauth2 JWT token
    @requires_dependencies(["httpx"], extras="vectara")
    def _get_jwt_token(self) -> str:
        """Connect to the server and get a JWT token."""
        import httpx

        token_endpoint, headers, data = self._token_request_args()

        with httpx.Client() as client:
            response = client.post(token_endpoint, headers=headers, data=data)
            response.raise_for_status()
            response_json = response.json()

        return self._record_token_response(response_json)

    @DestinationConnectionError.wrap
    def _check_connection_and_corpora(self) -> None:
        """
        Check the connection for Vectara and validate corpus exists.
        - If more than one corpus with the same name exists - raise error
        - If exactly one corpus exists with this name - use it.
        - If does not exist - raise error.
        """
        # Get token if not already set
        self.jwt_token

        # `_request` returns the decoded JSON body (a dict). It must not be
        # tuple-unpacked: that would bind dictionary keys, not the payload.
        list_corpora_response = self._request(
            http_method="GET",
            endpoint="corpora",
        )

        if self.connection_config.corpus_name:
            possible_corpora_keys_names_map = {
                corpus.get("key"): corpus.get("name")
                for corpus in list_corpora_response.get("corpora") or []
                if corpus.get("name") == self.connection_config.corpus_name
            }

            if len(possible_corpora_keys_names_map) > 1:
                raise ValueError(
                    f"Multiple Corpus exist with name {self.connection_config.corpus_name} in dest."
                )
            if len(possible_corpora_keys_names_map) == 1:
                resolved_key = next(iter(possible_corpora_keys_names_map))
                if not self.connection_config.corpus_key:
                    # Remember the key so the upload endpoints can be built from it.
                    self.connection_config.corpus_key = resolved_key
                elif self.connection_config.corpus_key != resolved_key:
                    raise ValueError("Corpus key does not match provided corpus name.")
            else:
                raise ValueError(
                    f"No Corpora exist with name {self.connection_config.corpus_name} in dest."
                )

    @staticmethod
    def _request_headers(token: str) -> dict:
        """Return the JSON/bearer headers sent with every API call."""
        return {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {token}",
            "X-source": "unstructured",
        }

    @staticmethod
    def _decode_body(response: Any) -> dict:
        """Decode a JSON response body, treating an empty body as an empty dict."""
        # Some calls (e.g. a successful DELETE) may come back without a body;
        # calling `.json()` on that would raise instead of signalling success.
        return response.json() if response.content else {}

    @requires_dependencies(["httpx"], extras="vectara")
    async def _async_request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """
        Send an authenticated request to `BASE_URL/<endpoint>` without blocking.

        Returns the decoded JSON body (an empty dict for an empty body).
        Raises whatever httpx raises for transport errors and non-2xx statuses.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(await self.jwt_token_async)

        async with httpx.AsyncClient() as client:
            response = await client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return self._decode_body(response)

    @requires_dependencies(["httpx"], extras="vectara")
    def _request(
        self,
        endpoint: str,
        http_method: str = "POST",
        params: Optional[Mapping[str, Any]] = None,
        data: Optional[Mapping[str, Any]] = None,
    ) -> dict:
        """
        Blocking counterpart of `_async_request`.

        Returns the decoded JSON body (an empty dict for an empty body).
        Raises whatever httpx raises for transport errors and non-2xx statuses.
        """
        import httpx

        url = f"{BASE_URL}/{endpoint}"
        headers = self._request_headers(self.jwt_token)

        with httpx.Client() as client:
            response = client.request(
                method=http_method, url=url, headers=headers, params=params, json=data
            )
            response.raise_for_status()
            return self._decode_body(response)

    async def _delete_doc(self, doc_id: str) -> dict:
        """
        Delete a document from the Vectara corpus.
        """

        return await self._async_request(
            endpoint=f"corpora/{self.connection_config.corpus_key}/documents/{doc_id}",
            http_method="DELETE",
        )

    async def _index_document(self, document: Dict[str, Any]) -> None:
        """
        Index a document (by uploading it to the Vectara corpus) from the document dictionary

        A failed first request is logged and swallowed so one bad document does
        not abort the rest of the batch. If the response reports that the
        document already exists, it is deleted and uploaded again.
        """

        logger.debug(
            f"Indexing document {document['id']} to corpus key {self.connection_config.corpus_key}"
        )

        try:
            result = await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
        except Exception as e:
            logger.error(f"exception {e} while indexing document {document['id']}")
            return

        if (
            "messages" in result
            and result["messages"]
            and (
                "ALREADY_EXISTS" in result["messages"]
                or (
                    "CONFLICT: Indexing doesn't support updating documents."
                    in result["messages"][0]
                )
            )
        ):
            logger.info(f"document {document['id']} already exists, re-indexing")
            await self._delete_doc(document["id"])
            await self._async_request(
                endpoint=f"corpora/{self.connection_config.corpus_key}/documents", data=document
            )
            return

        logger.info(f"indexing document {document['id']} succeeded")

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Index every document in `data` concurrently."""

        logger.info(f"inserting / updating {len(data)} documents to Vectara ")
        await asyncio.gather(*(self._index_document(vdoc) for vdoc in data))
342
+
343
+
344
# Registry entry bundling the Vectara connection, staging and upload classes.
vectara_destination_entry = DestinationRegistryEntry(
    connection_config=VectaraConnectionConfig,
    upload_stager_config=VectaraUploadStagerConfig,
    upload_stager=VectaraUploadStager,
    uploader_config=VectaraUploaderConfig,
    uploader=VectaraUploader,
)
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ )
6
+
7
+ from .cloud import CONNECTOR_TYPE as CLOUD_WEAVIATE_CONNECTOR_TYPE
8
+ from .cloud import weaviate_cloud_destination_entry
9
+ from .embedded import CONNECTOR_TYPE as EMBEDDED_WEAVIATE_CONNECTOR_TYPE
10
+ from .embedded import weaviate_embedded_destination_entry
11
+ from .local import CONNECTOR_TYPE as LOCAL_WEAVIATE_CONNECTOR_TYPE
12
+ from .local import weaviate_local_destination_entry
13
+
14
# Register each Weaviate flavour under its own connector type.
# Registration order is kept: local, then cloud, then embedded.
add_destination_entry(
    destination_type=LOCAL_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_local_destination_entry,
)
add_destination_entry(
    destination_type=CLOUD_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_cloud_destination_entry,
)
add_destination_entry(
    destination_type=EMBEDDED_WEAVIATE_CONNECTOR_TYPE,
    entry=weaviate_embedded_destination_entry,
)
@@ -0,0 +1,165 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Any, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.auth import AuthCredentials
20
+ from weaviate.client import WeaviateClient
21
+
22
+ CONNECTOR_TYPE = "weaviate-cloud"
23
+
24
+
25
# Secret values for the Weaviate Cloud auth methods. The connection config
# requires exactly one method to be populated unless `anonymous` is set.
class CloudWeaviateAccessConfig(WeaviateAccessConfig):
    # Bearer-token auth.
    access_token: Optional[str] = Field(
        default=None,
        description="Used to create the bearer token.",
    )
    # API-key auth.
    api_key: Optional[str] = None
    # Client-credentials auth.
    client_secret: Optional[str] = None
    # Password auth; paired with `username` on the connection config.
    password: Optional[str] = None
32
+
33
+
34
# Connection settings for a Weaviate Cloud cluster. Unless `anonymous` is
# set, exactly one auth method must be configured.
class CloudWeaviateConnectionConfig(WeaviateConnectionConfig):
    cluster_url: str = Field(
        description="The WCD cluster URL or hostname to connect to. "
        "Usually in the form: rAnD0mD1g1t5.something.weaviate.cloud"
    )
    username: Optional[str] = None
    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
    refresh_token: Optional[str] = Field(
        default=None,
        description="Will tie this value to the bearer token. If not provided, "
        "the authentication will expire once the lifetime of the access token is up.",
    )
    access_config: Secret[CloudWeaviateAccessConfig]

    def model_post_init(self, __context: Any) -> None:
        """
        Check after model construction that exactly one auth method is configured.

        Nothing is checked when `anonymous` is set.

        Raises:
            ValueError: if no auth method, or more than one, is configured.
        """
        if self.anonymous:
            return

        secrets = self.access_config.get_secret_value()
        # Method name -> whether its required values were supplied.
        provided = {
            "api_key": secrets.api_key is not None,
            "bearer_token": secrets.access_token is not None,
            "client_secret": secrets.client_secret is not None,
            "client_password": secrets.password is not None and self.username is not None,
        }
        existing_auths = [name for name, is_set in provided.items() if is_set]

        if not existing_auths:
            raise ValueError("No auth values provided and anonymous is False")
        if len(existing_auths) > 1:
            raise ValueError(
                "Multiple auth values provided, only one approach can be used: {}".format(
                    ", ".join(existing_auths)
                )
            )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_api_key_auth(self) -> Optional["AuthCredentials"]:
        """Return API-key credentials, or None when no API key is set."""
        from weaviate.classes.init import Auth

        api_key = self.access_config.get_secret_value().api_key
        if not api_key:
            return None
        return Auth.api_key(api_key=api_key)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_bearer_token_auth(self) -> Optional["AuthCredentials"]:
        """Return bearer-token credentials, or None when no access token is set."""
        from weaviate.classes.init import Auth

        access_token = self.access_config.get_secret_value().access_token
        if not access_token:
            return None
        return Auth.bearer_token(access_token=access_token, refresh_token=self.refresh_token)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_secret_auth(self) -> Optional["AuthCredentials"]:
        """Return client-credentials auth, or None when no client secret is set."""
        from weaviate.classes.init import Auth

        client_secret = self.access_config.get_secret_value().client_secret
        if not client_secret:
            return None
        return Auth.client_credentials(client_secret=client_secret)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client_password_auth(self) -> Optional["AuthCredentials"]:
        """Return username/password credentials, or None unless both values are set."""
        from weaviate.classes.init import Auth

        username = self.username
        if not username:
            return None
        password = self.access_config.get_secret_value().password
        if not password:
            return None
        return Auth.client_password(username=username, password=password)

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_auth(self) -> "AuthCredentials":
        """
        Return the single configured credentials object.

        Raises:
            ValueError: if no auth method, or more than one, yields credentials.
        """
        candidates = (
            self.get_api_key_auth(),
            self.get_client_secret_auth(),
            self.get_bearer_token_auth(),
            self.get_client_password_auth(),
        )
        configured = [candidate for candidate in candidates if candidate]
        if not configured:
            raise ValueError("No auth values provided and anonymous is False")
        if len(configured) > 1:
            raise ValueError("Multiple auth values provided, only one approach can be used")
        return configured[0]

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a Weaviate Cloud client; the connection's own context manager wraps the yield."""
        from weaviate import connect_to_weaviate_cloud
        from weaviate.classes.init import AdditionalConfig

        # Anonymous access sends no credentials at all.
        if self.anonymous:
            auth_credentials = None
        else:
            auth_credentials = self.get_auth()

        connection = connect_to_weaviate_cloud(
            cluster_url=self.cluster_url,
            auth_credentials=auth_credentials,
            additional_config=AdditionalConfig(timeout=self.get_timeout()),
        )
        with connection as weaviate_client:
            yield weaviate_client
131
+
132
+
133
# Cloud-specific stager config; adds nothing to the shared Weaviate stager config.
class CloudWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
135
+
136
+
137
@dataclass
class CloudWeaviateUploadStager(WeaviateUploadStager):
    """Weaviate upload stager registered for the cloud connector."""

    # The default must be an instance of the annotated cloud config class, not
    # the shared base class, so the default value matches the declared type.
    upload_stager_config: CloudWeaviateUploadStagerConfig = field(
        default_factory=lambda: CloudWeaviateUploadStagerConfig()
    )
142
+
143
+
144
# Cloud-specific uploader config; adds nothing to the shared Weaviate uploader config.
class CloudWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
146
+
147
+
148
@dataclass
class CloudWeaviateUploader(WeaviateUploader):
    """Weaviate uploader bound to the cloud connection and uploader configs."""

    # Each factory calls its config class with no arguments.
    connection_config: CloudWeaviateConnectionConfig = field(
        default_factory=CloudWeaviateConnectionConfig
    )
    upload_config: CloudWeaviateUploaderConfig = field(
        default_factory=CloudWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
157
+
158
+
159
# Registry entry bundling the Weaviate Cloud connection, staging and upload classes.
weaviate_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudWeaviateConnectionConfig,
    upload_stager_config=CloudWeaviateUploadStagerConfig,
    upload_stager=CloudWeaviateUploadStager,
    uploader_config=CloudWeaviateUploaderConfig,
    uploader=CloudWeaviateUploader,
)
@@ -0,0 +1,90 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
+ CONNECTOR_TYPE = "weaviate-embedded"
22
+
23
+
24
# Embedded-specific access config; adds nothing to the shared Weaviate access config.
class EmbeddedWeaviateAccessConfig(WeaviateAccessConfig):
    pass
26
+
27
+
28
# Connection settings for an embedded Weaviate instance.
class EmbeddedWeaviateConnectionConfig(WeaviateConnectionConfig):
    hostname: str = Field(default="127.0.0.1", description="hostname")
    port: int = Field(default=8079, description="http port")
    grpc_port: int = Field(default=50050, description="grpc port")
    data_path: Optional[str] = Field(
        default=None,
        description="directory where the files making up the "
        "database are stored. If not provided, will "
        "default to underlying SDK implementation",
    )
    # Defaults to an empty access config, which is still run through validation.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield an embedded Weaviate client; the connection's own context manager wraps the yield."""
        from weaviate import connect_to_embedded
        from weaviate.classes.init import AdditionalConfig

        connection = connect_to_embedded(
            hostname=self.hostname,
            port=self.port,
            grpc_port=self.grpc_port,
            persistence_data_path=self.data_path,
            additional_config=AdditionalConfig(timeout=self.get_timeout()),
        )
        with connection as weaviate_client:
            yield weaviate_client
56
+
57
+
58
# Embedded-specific stager config; adds nothing to the shared Weaviate stager config.
class EmbeddedWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
60
+
61
+
62
@dataclass
class EmbeddedWeaviateUploadStager(WeaviateUploadStager):
    """Weaviate upload stager registered for the embedded connector."""

    # The default must be an instance of the annotated embedded config class,
    # not the shared base class, so the default value matches the declared type.
    upload_stager_config: EmbeddedWeaviateUploadStagerConfig = field(
        default_factory=lambda: EmbeddedWeaviateUploadStagerConfig()
    )
67
+
68
+
69
# Embedded-specific uploader config; adds nothing to the shared Weaviate uploader config.
class EmbeddedWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
71
+
72
+
73
@dataclass
class EmbeddedWeaviateUploader(WeaviateUploader):
    """Weaviate uploader bound to the embedded connection and uploader configs."""

    # Each factory calls its config class with no arguments.
    connection_config: EmbeddedWeaviateConnectionConfig = field(
        default_factory=EmbeddedWeaviateConnectionConfig
    )
    upload_config: EmbeddedWeaviateUploaderConfig = field(
        default_factory=EmbeddedWeaviateUploaderConfig
    )
    connector_type: str = CONNECTOR_TYPE
82
+
83
+
84
# Registry entry bundling the embedded Weaviate connection, staging and upload classes.
weaviate_embedded_destination_entry = DestinationRegistryEntry(
    connection_config=EmbeddedWeaviateConnectionConfig,
    upload_stager_config=EmbeddedWeaviateUploadStagerConfig,
    upload_stager=EmbeddedWeaviateUploadStager,
    uploader_config=EmbeddedWeaviateUploaderConfig,
    uploader=EmbeddedWeaviateUploader,
)
@@ -0,0 +1,73 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Generator
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
9
+ from unstructured_ingest.v2.processes.connectors.weaviate.weaviate import (
10
+ WeaviateAccessConfig,
11
+ WeaviateConnectionConfig,
12
+ WeaviateUploader,
13
+ WeaviateUploaderConfig,
14
+ WeaviateUploadStager,
15
+ WeaviateUploadStagerConfig,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from weaviate.client import WeaviateClient
20
+
21
+ CONNECTOR_TYPE = "weaviate-local"
22
+
23
+
24
# Local-specific access config; adds nothing to the shared Weaviate access config.
class LocalWeaviateAccessConfig(WeaviateAccessConfig):
    pass
26
+
27
+
28
# Connection settings for a locally running Weaviate instance.
class LocalWeaviateConnectionConfig(WeaviateConnectionConfig):
    # Defaults to an empty access config, which is still run through validation.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @contextmanager
    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Yield a local Weaviate client; the connection's own context manager wraps the yield."""
        from weaviate import connect_to_local
        from weaviate.classes.init import AdditionalConfig

        # Only the timeout is customised; everything else uses the SDK defaults.
        client_options = AdditionalConfig(timeout=self.get_timeout())
        with connect_to_local(additional_config=client_options) as weaviate_client:
            yield weaviate_client
43
+
44
+
45
# Local-specific stager config; adds nothing to the shared Weaviate stager config.
class LocalWeaviateUploadStagerConfig(WeaviateUploadStagerConfig):
    pass
47
+
48
+
49
@dataclass
class LocalWeaviateUploadStager(WeaviateUploadStager):
    """Weaviate upload stager registered for the local connector."""

    # The default must be an instance of the annotated local config class, not
    # the shared base class, so the default value matches the declared type.
    upload_stager_config: LocalWeaviateUploadStagerConfig = field(
        default_factory=lambda: LocalWeaviateUploadStagerConfig()
    )
54
+
55
+
56
# Local-specific uploader config; adds nothing to the shared Weaviate uploader config.
class LocalWeaviateUploaderConfig(WeaviateUploaderConfig):
    pass
58
+
59
+
60
@dataclass
class LocalWeaviateUploader(WeaviateUploader):
    """Weaviate uploader bound to the local connection and uploader configs."""

    # Declaration order is kept exactly as it was.
    # NOTE(review): presumably these redeclare fields inherited from
    # WeaviateUploader, which would fix their dataclass ordering - confirm.
    upload_config: LocalWeaviateUploaderConfig
    connector_type: str = CONNECTOR_TYPE
    connection_config: LocalWeaviateConnectionConfig
65
+
66
+
67
# Registry entry bundling the local Weaviate connection, staging and upload classes.
weaviate_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalWeaviateConnectionConfig,
    upload_stager_config=LocalWeaviateUploadStagerConfig,
    upload_stager=LocalWeaviateUploadStager,
    uploader_config=LocalWeaviateUploaderConfig,
    uploader=LocalWeaviateUploader,
)