unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,277 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from typing import TYPE_CHECKING, Any, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.error import DestinationConnectionError
8
+ from unstructured_ingest.utils.data_prep import (
9
+ flatten_dict,
10
+ generator_batching_wbytes,
11
+ )
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ FileData,
18
+ Uploader,
19
+ UploaderConfig,
20
+ UploadStager,
21
+ UploadStagerConfig,
22
+ )
23
+ from unstructured_ingest.v2.logger import logger
24
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
25
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
26
+
27
+ if TYPE_CHECKING:
28
+ from pinecone import Index as PineconeIndex
29
+ from pinecone import Pinecone
30
+
31
+
32
# Name under which this connector is identified (used as PineconeUploader.connector_type).
CONNECTOR_TYPE = "pinecone"
# Byte budget used when splitting elements into upsert batches (2MB).
MAX_PAYLOAD_SIZE = 2 * 1024**2
# Ceiling on the pool_threads value passed when opening an index handle.
MAX_POOL_THREADS = 100
# 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
MAX_METADATA_BYTES = 40 * 1024
# Not referenced anywhere else in this module.
MAX_QUERY_RESULTS = 10_000
37
+
38
+
39
# Sensitive Pinecone settings; PineconeConnectionConfig stores this model wrapped in ``Secret``.
class PineconeAccessConfig(AccessConfig):
    # Populated through the "api_key" alias; defaults to None when not supplied.
    pinecone_api_key: Optional[str] = Field(
        default=None, description="API key for Pinecone.", alias="api_key"
    )
43
+
44
+
45
# Connection settings for a single Pinecone index, plus helpers that build SDK handles.
class PineconeConnectionConfig(ConnectionConfig):
    index_name: str = Field(description="Name of the index to connect to.")
    access_config: Secret[PineconeAccessConfig] = Field(
        default=PineconeAccessConfig(), validate_default=True
    )

    @requires_dependencies(["pinecone"], extras="pinecone")
    def get_client(self, **index_kwargs) -> "Pinecone":
        """Create a Pinecone client from the stored API key.

        The client is tagged with the running unstructured_ingest version.
        ``index_kwargs`` is accepted for signature compatibility but is not used here.
        """
        from pinecone import Pinecone

        from unstructured_ingest import __version__ as unstructured_version

        api_key = self.access_config.get_secret_value().pinecone_api_key
        source_tag = f"unstructured_ingest=={unstructured_version}"
        return Pinecone(api_key=api_key, source_tag=source_tag)

    def get_index(self, **index_kwargs) -> "PineconeIndex":
        """Open a handle to ``index_name``, forwarding ``index_kwargs`` to ``Pinecone.Index``.

        The index description is fetched as well and written to the debug log.
        """
        client = self.get_client()

        index_handle = client.Index(name=self.index_name, **index_kwargs)
        logger.debug(f"connected to index: {client.describe_index(self.index_name)}")
        return index_handle
68
+
69
+
70
# Default metadata keys copied into the Pinecone payload
# (used as the default of PineconeUploadStagerConfig.metadata_fields).
ALLOWED_FIELDS = (
    "element_id", "text", "parent_id", "category_depth",
    "emphasized_text_tags", "emphasized_text_contents",
    "coordinates", "last_modified", "page_number", "filename",
    "is_continuation", "link_urls", "link_texts", "text_as_html",
)
86
+
87
+
88
# Settings for the staging step that reshapes elements into Pinecone records.
class PineconeUploadStagerConfig(UploadStagerConfig):
    # Keys kept when building record metadata; defaults to a list copy of ALLOWED_FIELDS.
    metadata_fields: list[str] = Field(
        default=list(ALLOWED_FIELDS),
        description=(
            "which metadata from the source element to map to the payload metadata being sent to "
            "Pinecone."
        ),
    )
96
+
97
+
98
# Settings that control how PineconeUploader batches, threads and scopes its writes.
class PineconeUploaderConfig(UploaderConfig):
    # Passed as max_batch_size to generator_batching_wbytes; None leaves batches size-limited only.
    batch_size: Optional[int] = Field(
        default=None,
        description="Optional number of records per batch. Will otherwise limit by size.",
    )
    # A falsy value makes the uploader fall back to min(number of batches, MAX_POOL_THREADS).
    pool_threads: Optional[int] = Field(
        default=1, description="Optional limit on number of threads to use for upload"
    )
    # Added to list/delete/upsert calls only when set.
    namespace: Optional[str] = Field(
        default=None,
        description="The namespace to write to. If not specified, the default namespace is used",
    )
    # Metadata key used in the pod-index delete filter and in log messages.
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
114
+
115
+
116
@dataclass
class PineconeUploadStager(UploadStager):
    """Reshapes partitioned elements into the id/values/metadata records sent to Pinecone."""

    upload_stager_config: PineconeUploadStagerConfig = field(
        default_factory=lambda: PineconeUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Convert one element dict into a Pinecone record.

        Note that ``element_dict`` is modified in place: its "embeddings" and
        "metadata" entries are popped.

        Args:
            element_dict: the element to convert.
            file_data: source file descriptor; its identifier prefixes the record id
                and is stored in the metadata under RECORD_ID_LABEL.

        Returns:
            A dict with the keys "id", "values" and "metadata".
        """
        embeddings = element_dict.pop("embeddings", None)
        metadata: dict[str, Any] = element_dict.pop("metadata", {})
        data_source = metadata.pop("data_source", {})
        coordinates = metadata.pop("coordinates", {})

        # Collect the configured fields; later sources win when a key repeats.
        selected = {}
        for source in (element_dict, metadata, data_source, coordinates):
            for key, value in source.items():
                if key in self.upload_stager_config.metadata_fields:
                    selected[key] = value

        flattened = flatten_dict(
            selected, separator="-", flatten_lists=True, remove_none=True
        )
        size_in_bytes = len(json.dumps(flattened).encode())
        if size_in_bytes > MAX_METADATA_BYTES:
            logger.info(
                f"Metadata size is {size_in_bytes} bytes, which exceeds the limit of"
                f" {MAX_METADATA_BYTES} bytes per vector. Dropping the metadata."
            )
            flattened = {}

        # The record id label is always stored, even when the metadata was dropped.
        flattened[RECORD_ID_LABEL] = file_data.identifier

        # To support more optimal deletes, a prefix is suggested for each record:
        # https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
        element_id = get_enhanced_element_id(element_dict=element_dict, file_data=file_data)
        record = {
            "id": f"{file_data.identifier}#{element_id}",
            "values": embeddings,
            "metadata": flattened,
        }
        return record
160
+
161
+
162
@dataclass
class PineconeUploader(Uploader):
    """Writes staged records to a Pinecone index.

    Before upserting, records left over from a previous run of the same file are
    removed, using a strategy that depends on whether the index is serverless or
    pod based.
    """

    upload_config: PineconeUploaderConfig
    connection_config: PineconeConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self):
        """Verify that the configured index can be reached.

        Raises:
            DestinationConnectionError: if obtaining the index fails for any reason.
        """
        try:
            self.connection_config.get_index()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the original exception so the root cause stays in the traceback.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def pod_delete_by_record_id(self, file_data: FileData) -> None:
        """Delete records of ``file_data`` from a pod index via a metadata filter."""
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone pod index"
        )
        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
        delete_kwargs = {
            "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
        }
        if namespace := self.upload_config.namespace:
            delete_kwargs["namespace"] = namespace

        resp = index.delete(**delete_kwargs)
        logger.debug(
            f"deleted any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone index: {resp}"
        )

    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
        """Delete records of ``file_data`` from a serverless index by id prefix.

        Ids are listed with the prefix "<identifier>#" (the format produced by the
        stager) and deleted one listed batch at a time.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone serverless index"
        )
        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
        namespace = self.upload_config.namespace
        list_kwargs = {"prefix": f"{file_data.identifier}#"}
        if namespace:
            list_kwargs["namespace"] = namespace

        deleted_ids = 0
        for ids in index.list(**list_kwargs):
            deleted_ids += len(ids)
            delete_kwargs = {"ids": ids}
            if namespace:
                delete_kwargs["namespace"] = namespace
            # Inspect the actual delete response; the previous code checked the
            # namespace string instead and never looked at the result of delete().
            delete_resp = index.delete(**delete_kwargs)
            # delete_resp should be an empty dict if there were no errors
            if delete_resp:
                logger.error(f"failed to delete batch of ids: {delete_resp}")
        logger.info(
            f"deleted {deleted_ids} records with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from pinecone index"
        )

    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batches_async(self, elements_dict: list[dict]):
        """Upsert ``elements_dict`` in size-limited batches using async requests.

        Raises:
            DestinationConnectionError: if Pinecone reports an API error for any batch.
        """
        from pinecone.exceptions import PineconeApiException

        chunks = list(
            generator_batching_wbytes(
                iterable=elements_dict,
                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
                max_batch_size=self.upload_config.batch_size,
            )
        )
        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")

        # Never request more threads than there are batches (or than the hard cap).
        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
        if self.upload_config.pool_threads:
            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
        else:
            pool_threads = max_pool_threads
        index = self.connection_config.get_index(pool_threads=pool_threads)
        with index:
            upsert_kwargs = [{"vectors": chunk, "async_req": True} for chunk in chunks]
            if namespace := self.upload_config.namespace:
                for kwargs in upsert_kwargs:
                    kwargs["namespace"] = namespace
            async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
            # Wait for and retrieve responses (this raises in case of error)
            try:
                results = [async_result.get() for async_result in async_results]
            except PineconeApiException as api_error:
                raise DestinationConnectionError(f"http error: {api_error}") from api_error
            logger.debug(f"results: {results}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the records of ``file_data`` in the index with ``data``.

        Raises:
            ValueError: if the index description's spec is neither serverless nor pod.
        """
        logger.info(
            f"writing a total of {len(data)} elements via"
            f" document batches to destination"
            f" index named {self.connection_config.index_name}"
        )
        # Determine if serverless or pod based index
        pinecone_client = self.connection_config.get_client()
        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
        if "serverless" in index_description.get("spec"):
            self.serverless_delete_by_record_id(file_data=file_data)
        elif "pod" in index_description.get("spec"):
            self.pod_delete_by_record_id(file_data=file_data)
        else:
            raise ValueError(f"unexpected spec type in index description: {index_description}")
        self.upsert_batches_async(elements_dict=data)
269
+
270
+
271
# Bundles the Pinecone config, stager and uploader classes into one destination registry entry.
pinecone_destination_entry = DestinationRegistryEntry(
    connection_config=PineconeConnectionConfig,
    uploader=PineconeUploader,
    uploader_config=PineconeUploaderConfig,
    upload_stager=PineconeUploadStager,
    upload_stager_config=PineconeUploadStagerConfig,
)
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ )
6
+
7
+ from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
8
+ from .cloud import qdrant_cloud_destination_entry
9
+ from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
10
+ from .local import qdrant_local_destination_entry
11
+ from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
12
+ from .server import qdrant_server_destination_entry
13
+
14
+ add_destination_entry(destination_type=CLOUD_CONNECTOR_TYPE, entry=qdrant_cloud_destination_entry)
15
+ add_destination_entry(destination_type=SERVER_CONNECTOR_TYPE, entry=qdrant_server_destination_entry)
16
+ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=qdrant_local_destination_entry)
@@ -0,0 +1,59 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
# Connector identifier for the Qdrant Cloud destination (default of CloudQdrantUploader.connector_type).
CONNECTOR_TYPE = "qdrant-cloud"
16
+
17
+
18
# Access settings for Qdrant Cloud; unlike the base access config, an API key is required.
class CloudQdrantAccessConfig(QdrantAccessConfig):
    api_key: str = Field(description="Qdrant API key")
20
+
21
+
22
# Connection settings for a Qdrant Cloud cluster.
class CloudQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as ``str`` but defaults to None - confirm whether
    # the url is meant to be optional.
    url: str = Field(default=None, description="url of Qdrant Cloud")
    access_config: Secret[CloudQdrantAccessConfig]

    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments (API key and url) used to build Qdrant clients."""
        api_key = self.access_config.get_secret_value().api_key
        return dict(api_key=api_key, url=self.url)
31
+
32
+
33
# Cloud-specific stager config; adds nothing to QdrantUploadStagerConfig.
class CloudQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
35
+
36
+
37
# Cloud flavour of the Qdrant stager; only narrows the config type and, by
# redeclaring the field without a default, makes it a required argument.
@dataclass
class CloudQdrantUploadStager(QdrantUploadStager):
    upload_stager_config: CloudQdrantUploadStagerConfig
40
+
41
+
42
# Cloud-specific uploader config; adds nothing to QdrantUploaderConfig.
class CloudQdrantUploaderConfig(QdrantUploaderConfig):
    pass
44
+
45
+
46
# Cloud flavour of the Qdrant uploader; narrows the config types and sets the connector type.
@dataclass
class CloudQdrantUploader(QdrantUploader):
    connection_config: CloudQdrantConnectionConfig
    upload_config: CloudQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
51
+
52
+
53
# Bundles the Qdrant Cloud config, stager and uploader classes into one destination registry entry.
qdrant_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=CloudQdrantConnectionConfig,
    uploader=CloudQdrantUploader,
    uploader_config=CloudQdrantUploaderConfig,
    upload_stager=CloudQdrantUploadStager,
    upload_stager_config=CloudQdrantUploadStagerConfig,
)
@@ -0,0 +1,58 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
# Connector identifier for the local Qdrant destination (default of LocalQdrantUploader.connector_type).
CONNECTOR_TYPE = "qdrant-local"
16
+
17
+
18
# Access settings for local Qdrant; declares no fields of its own.
class LocalQdrantAccessConfig(QdrantAccessConfig):
    pass
20
+
21
+
22
# Connection settings for a Qdrant instance persisted on the local filesystem.
class LocalQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as ``str`` but defaults to None - confirm whether
    # the path is meant to be optional.
    path: str = Field(default=None, description="Persistence path for QdrantLocal.")
    access_config: Secret[LocalQdrantAccessConfig] = Field(
        default_factory=LocalQdrantAccessConfig, validate_default=True
    )

    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments (only ``path``) used to build Qdrant clients."""
        return dict(path=self.path)
30
+
31
+
32
# Local-specific stager config; adds nothing to QdrantUploadStagerConfig.
class LocalQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
34
+
35
+
36
# Local flavour of the Qdrant stager; only narrows the config type and, by
# redeclaring the field without a default, makes it a required argument.
@dataclass
class LocalQdrantUploadStager(QdrantUploadStager):
    upload_stager_config: LocalQdrantUploadStagerConfig
39
+
40
+
41
# Local-specific uploader config; adds nothing to QdrantUploaderConfig.
class LocalQdrantUploaderConfig(QdrantUploaderConfig):
    pass
43
+
44
+
45
# Local flavour of the Qdrant uploader; narrows the config types and sets the connector type.
@dataclass
class LocalQdrantUploader(QdrantUploader):
    connection_config: LocalQdrantConnectionConfig
    upload_config: LocalQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
50
+
51
+
52
# Bundles the local Qdrant config, stager and uploader classes into one destination registry entry.
qdrant_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalQdrantConnectionConfig,
    uploader=LocalQdrantUploader,
    uploader_config=LocalQdrantUploaderConfig,
    upload_stager=LocalQdrantUploadStager,
    upload_stager_config=LocalQdrantUploadStagerConfig,
)
@@ -0,0 +1,160 @@
1
+ import asyncio
2
+ import json
3
+ from abc import ABC, abstractmethod
4
+ from contextlib import asynccontextmanager, contextmanager
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
11
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ FileData,
17
+ Uploader,
18
+ UploaderConfig,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ )
22
+ from unstructured_ingest.v2.logger import logger
23
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
24
+
25
+ if TYPE_CHECKING:
26
+ from qdrant_client import AsyncQdrantClient, QdrantClient
27
+
28
+
29
# Common base for the access configs of the Qdrant connector flavours; declares no fields.
class QdrantAccessConfig(AccessConfig, ABC):
    pass
31
+
32
+
33
# Shared connection logic for all Qdrant flavours; subclasses only supply the client kwargs.
class QdrantConnectionConfig(ConnectionConfig, ABC):
    access_config: Secret[QdrantAccessConfig] = Field(
        default_factory=QdrantAccessConfig, validate_default=True, description="Access Config"
    )

    @abstractmethod
    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments passed to the Qdrant client constructors."""
        pass

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @asynccontextmanager
    async def get_async_client(self) -> AsyncGenerator["AsyncQdrantClient", None]:
        """Yield an ``AsyncQdrantClient`` and close it when the context exits."""
        from qdrant_client import AsyncQdrantClient

        async_client = AsyncQdrantClient(**self.get_client_kwargs())
        try:
            yield async_client
        finally:
            await async_client.close()

    @requires_dependencies(["qdrant_client"], extras="qdrant")
    @contextmanager
    def get_client(self) -> Generator["QdrantClient", None, None]:
        """Yield a ``QdrantClient`` and close it when the context exits."""
        from qdrant_client import QdrantClient

        sync_client = QdrantClient(**self.get_client_kwargs())
        try:
            yield sync_client
        finally:
            sync_client.close()
65
+
66
+
67
# Stager settings shared by the Qdrant connector flavours; declares no fields.
class QdrantUploadStagerConfig(UploadStagerConfig):
    pass
69
+
70
+
71
@dataclass
class QdrantUploadStager(UploadStager, ABC):
    """Reshapes partitioned elements into Qdrant point dictionaries."""

    upload_stager_config: QdrantUploadStagerConfig = field(
        default_factory=lambda: QdrantUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Prepares dictionary in the format that Qdrant requires.

        Works on a shallow copy, so ``element_dict`` itself keeps its keys.

        Returns:
            A dict with "id", "vector" and "payload"; the payload carries the text,
            a JSON dump of the remaining element and its flattened fields.
        """
        data = element_dict.copy()
        # The id is derived before "embeddings" and "text" are removed from the copy.
        point_id = get_enhanced_element_id(element_dict=data, file_data=file_data)
        vector = data.pop("embeddings", {})
        text = data.pop("text", None)

        payload = {"text": text, "element_serialized": json.dumps(data)}
        # Flattened element fields come last and override same-named keys above.
        payload.update(flatten_dict(data, separator="-", flatten_lists=True))
        return {"id": point_id, "vector": vector, "payload": payload}
93
+
94
+
95
# Upload settings shared by the Qdrant connector flavours.
class QdrantUploaderConfig(UploaderConfig):
    # Collection that precheck looks for and that points are upserted into.
    collection_name: str = Field(description="Name of the collection.")
    # Number of points sent per upsert call.
    batch_size: int = Field(default=50, description="Number of records per batch.")
    # Marked deprecated; not read anywhere in this module.
    num_processes: Optional[int] = Field(
        default=1,
        description="Optional limit on number of threads to use for upload.",
        deprecated=True,
    )
103
+
104
+
105
@dataclass
class QdrantUploader(Uploader, ABC):
    """Asynchronously upserts staged points into a Qdrant collection."""

    upload_config: QdrantUploaderConfig
    connection_config: QdrantConnectionConfig

    @DestinationConnectionError.wrap
    def precheck(self) -> None:
        """Check that the configured collection exists on the target instance.

        Raises:
            DestinationConnectionError: if the collection is not among those returned.
        """
        with self.connection_config.get_client() as client:
            existing = [c.name for c in client.get_collections().collections]
            wanted = self.upload_config.collection_name
            if wanted not in existing:
                message = "collection '{}' not found: {}".format(wanted, ", ".join(existing))
                raise DestinationConnectionError(message)

    def is_async(self):
        """Report that this uploader is driven through ``run_data_async``."""
        return True

    async def run_data_async(
        self,
        data: list[dict],
        file_data: FileData,
        **kwargs: Any,
    ) -> None:
        """Split ``data`` into batches and upsert all of them concurrently."""
        batch_size = self.upload_config.batch_size
        batches = list(batch_generator(data, batch_size=batch_size))
        logger.debug(
            "Elements split into %i batches of size %i.",
            len(batches),
            batch_size,
        )
        pending = [self._upsert_batch(batch) for batch in batches]
        await asyncio.gather(*pending)

    async def _upsert_batch(self, batch: list[dict]) -> None:
        """Upsert one batch of point dicts, each through its own async client.

        Raises:
            WriteError: if anything fails while connecting or upserting.
        """
        from qdrant_client import models

        points: list[models.PointStruct] = [models.PointStruct(**item) for item in batch]
        collection = self.upload_config.collection_name
        try:
            logger.debug(
                "Upserting %i points to the '%s' collection.",
                len(points),
                collection,
            )
            async with self.connection_config.get_async_client() as async_client:
                await async_client.upsert(collection, points=points, wait=True)
        except Exception as api_error:
            logger.error(
                "Failed to upsert points to the collection due to the following error %s", api_error
            )

            raise WriteError(f"Qdrant error: {api_error}") from api_error
        else:
            logger.debug("Successfully upsert points to the collection.")
@@ -0,0 +1,60 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
6
+ from unstructured_ingest.v2.processes.connectors.qdrant.qdrant import (
7
+ QdrantAccessConfig,
8
+ QdrantConnectionConfig,
9
+ QdrantUploader,
10
+ QdrantUploaderConfig,
11
+ QdrantUploadStager,
12
+ QdrantUploadStagerConfig,
13
+ )
14
+
15
# Connector identifier for the Qdrant server destination (default of ServerQdrantUploader.connector_type).
CONNECTOR_TYPE = "qdrant-server"
16
+
17
+
18
# Access settings for a Qdrant server; declares no fields of its own.
class ServerQdrantAccessConfig(QdrantAccessConfig):
    pass
20
+
21
+
22
# Connection settings for a Qdrant server reachable by URL.
class ServerQdrantConnectionConfig(QdrantConnectionConfig):
    # NOTE(review): annotated as ``str`` but defaults to None - confirm whether
    # the url is meant to be optional.
    url: str = Field(default=None, description="url of Qdrant server")
    access_config: Secret[ServerQdrantAccessConfig] = Field(
        default_factory=ServerQdrantAccessConfig, validate_default=True
    )

    def get_client_kwargs(self) -> dict:
        """Return the keyword arguments (only ``url``) used to build Qdrant clients."""
        return dict(url=self.url)
32
+
33
+
34
# Server-specific stager config; adds nothing to QdrantUploadStagerConfig.
class ServerQdrantUploadStagerConfig(QdrantUploadStagerConfig):
    pass
36
+
37
+
38
# Server flavour of the Qdrant stager; only narrows the config type and, by
# redeclaring the field without a default, makes it a required argument.
@dataclass
class ServerQdrantUploadStager(QdrantUploadStager):
    upload_stager_config: ServerQdrantUploadStagerConfig
41
+
42
+
43
# Server-specific uploader config; adds nothing to QdrantUploaderConfig.
class ServerQdrantUploaderConfig(QdrantUploaderConfig):
    pass
45
+
46
+
47
# Server flavour of the Qdrant uploader; narrows the config types and sets the connector type.
@dataclass
class ServerQdrantUploader(QdrantUploader):
    connection_config: ServerQdrantConnectionConfig
    upload_config: ServerQdrantUploaderConfig
    connector_type: str = CONNECTOR_TYPE
52
+
53
+
54
# Bundles the Qdrant server config, stager and uploader classes into one destination registry entry.
qdrant_server_destination_entry = DestinationRegistryEntry(
    connection_config=ServerQdrantConnectionConfig,
    uploader=ServerQdrantUploader,
    uploader_config=ServerQdrantUploaderConfig,
    upload_stager=ServerQdrantUploadStager,
    upload_stager_config=ServerQdrantUploadStagerConfig,
)