unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,470 @@
1
+ import collections
2
+ import hashlib
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional, Union
8
+
9
+ from pydantic import BaseModel, Field, Secret, SecretStr
10
+
11
+ from unstructured_ingest.error import (
12
+ DestinationConnectionError,
13
+ SourceConnectionError,
14
+ SourceConnectionNetworkError,
15
+ WriteError,
16
+ )
17
+ from unstructured_ingest.utils.data_prep import (
18
+ batch_generator,
19
+ flatten_dict,
20
+ generator_batching_wbytes,
21
+ )
22
+ from unstructured_ingest.utils.dep_check import requires_dependencies
23
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
24
+ from unstructured_ingest.v2.interfaces import (
25
+ AccessConfig,
26
+ BatchFileData,
27
+ BatchItem,
28
+ ConnectionConfig,
29
+ Downloader,
30
+ DownloaderConfig,
31
+ DownloadResponse,
32
+ FileData,
33
+ FileDataSourceMetadata,
34
+ Indexer,
35
+ IndexerConfig,
36
+ SourceIdentifiers,
37
+ Uploader,
38
+ UploaderConfig,
39
+ UploadStager,
40
+ UploadStagerConfig,
41
+ download_responses,
42
+ )
43
+ from unstructured_ingest.v2.logger import logger
44
+ from unstructured_ingest.v2.processes.connector_registry import (
45
+ DestinationRegistryEntry,
46
+ SourceRegistryEntry,
47
+ )
48
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
49
+
50
+ if TYPE_CHECKING:
51
+ from elasticsearch import Elasticsearch as ElasticsearchClient
52
+
53
+ CONNECTOR_TYPE = "elasticsearch"
54
+
55
+
56
class ElastisearchAdditionalMetadata(BaseModel):
    """Extra metadata attached to each Elasticsearch batch."""

    # Name of the index the batch was taken from.
    index_name: str
58
+
59
+
60
class ElasticsearchBatchFileData(BatchFileData):
    """Batch file data whose additional metadata carries the index name."""

    additional_metadata: ElastisearchAdditionalMetadata
62
+
63
+
64
class ElasticsearchAccessConfig(AccessConfig):
    """Sensitive credentials accepted by the Elasticsearch connector."""

    password: Optional[str] = Field(
        description="password when using basic auth or connecting to a cloud instance",
        default=None,
    )
    es_api_key: Optional[str] = Field(
        description="api key used for authentication",
        default=None,
    )
    bearer_auth: Optional[str] = Field(
        description="bearer token used for HTTP bearer authentication",
        default=None,
    )
    ssl_assert_fingerprint: Optional[str] = Field(
        description="SHA256 fingerprint value",
        default=None,
    )
75
+
76
+
77
class ElasticsearchClientInput(BaseModel):
    """Validated set of keyword arguments used to build the Elasticsearch client.

    Credential fields are wrapped in ``Secret``/``SecretStr``.
    """

    # Where to connect.
    hosts: Optional[list[str]] = None
    cloud_id: Optional[str] = None
    ca_certs: Optional[Path] = None

    # How to authenticate.
    basic_auth: Optional[Secret[tuple[str, str]]] = None
    api_key: Optional[Union[Secret[tuple[str, str]], SecretStr]] = None
83
+
84
+
85
class ElasticsearchConnectionConfig(ConnectionConfig):
    """Connection settings for an Elasticsearch cluster.

    Translates user-facing options into the keyword arguments expected by the
    Elasticsearch Python client and exposes a context-managed client.
    """

    hosts: Optional[list[str]] = Field(
        description="list of the Elasticsearch hosts to connect to",
        examples=["http://localhost:9200"],
        default=None,
    )
    username: Optional[str] = Field(description="username when using basic auth", default=None)
    cloud_id: Optional[str] = Field(description="id used to connect to Elastic Cloud", default=None)
    api_key_id: Optional[str] = Field(
        description="id associated with api key used for authentication: "
        "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html",  # noqa: E501
        default=None,
    )
    ca_certs: Optional[Path] = None
    access_config: Secret[ElasticsearchAccessConfig]

    def get_client_kwargs(self) -> dict:
        """Build the keyword arguments for the Elasticsearch client.

        Auth selection, first match wins:
          1. password + (cloud_id, ca_certs or ssl_assert_fingerprint)
             -> basic auth as the ``elastic`` user
          2. username + password without cloud_id -> basic auth as ``username``
          3. es_api_key + api_key_id -> ``(id, key)`` api key tuple
          4. es_api_key alone -> api key string

        Returns:
            dict containing only the options that ended up with a value.
        """
        # Supported auth methods:
        # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html
        # NOTE(review): ``bearer_auth`` and ``ssl_assert_fingerprint`` from the access
        # config are never added to the returned kwargs; the fingerprint only
        # influences which basic-auth user is chosen. Confirm this is intended.
        secrets = self.access_config.get_secret_value()
        raw_inputs: dict[str, Any] = {}

        if self.hosts:
            raw_inputs["hosts"] = self.hosts
        if self.cloud_id:
            raw_inputs["cloud_id"] = self.cloud_id
        if self.ca_certs:
            raw_inputs["ca_certs"] = self.ca_certs

        if secrets.password and (
            self.cloud_id or self.ca_certs or secrets.ssl_assert_fingerprint
        ):
            raw_inputs["basic_auth"] = ("elastic", secrets.password)
        elif not self.cloud_id and self.username and secrets.password:
            raw_inputs["basic_auth"] = (self.username, secrets.password)
        elif secrets.es_api_key and self.api_key_id:
            raw_inputs["api_key"] = (self.api_key_id, secrets.es_api_key)
        elif secrets.es_api_key:
            raw_inputs["api_key"] = secrets.es_api_key

        client_input = ElasticsearchClientInput(**raw_inputs)
        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")

        resolved = client_input.model_dump()
        # Hand the client plain credential values instead of Secret wrappers.
        basic_auth = client_input.basic_auth
        api_key = client_input.api_key
        resolved["basic_auth"] = basic_auth.get_secret_value() if basic_auth else None
        resolved["api_key"] = api_key.get_secret_value() if api_key else None
        return {name: value for name, value in resolved.items() if value is not None}

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    @contextmanager
    def get_client(self) -> Generator["ElasticsearchClient", None, None]:
        """Yield an Elasticsearch client that is closed when the context exits."""
        from elasticsearch import Elasticsearch as ElasticsearchClient

        with ElasticsearchClient(**self.get_client_kwargs()) as es_client:
            yield es_client
142
+
143
+
144
class ElasticsearchIndexerConfig(IndexerConfig):
    """Indexing options for the Elasticsearch source."""

    # Index whose documents are listed.
    index_name: str
    # Number of document ids grouped into each emitted batch.
    batch_size: int = 100
147
+
148
+
149
@dataclass
class ElasticsearchIndexer(Indexer):
    """List the document ids of an index and emit them in batches."""

    connection_config: ElasticsearchConnectionConfig
    index_config: ElasticsearchIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify that the cluster is reachable and the configured index exists.

        Raises:
            SourceConnectionError: if the connection fails or the index is missing.
        """
        try:
            with self.connection_config.get_client() as client:
                indices = client.indices.get_alias(index="*")
                if self.index_config.index_name not in indices:
                    raise SourceConnectionError(
                        "index {} not found: {}".format(
                            self.index_config.index_name, ", ".join(indices.keys())
                        )
                    )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_scan(self):
        """Return the ``scan`` helper; subclasses override to swap the SDK."""
        from elasticsearch.helpers import scan

        return scan

    def _get_doc_ids(self) -> set[str]:
        """Fetch all document ids in the configured index."""
        scan = self.load_scan()

        # Empty stored_fields: only hit metadata such as ``_id`` is needed.
        scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
        with self.connection_config.get_client() as client:
            hits = scan(
                client,
                query=scan_query,
                scroll="1m",
                index=self.index_config.index_name,
            )
            # ``scan`` is lazy: consume it while the client is still open.
            return {hit["_id"] for hit in hits}

    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
        """Yield one ``ElasticsearchBatchFileData`` per batch of document ids."""
        index_name = self.index_config.index_name
        # ``hosts`` is optional (a cloud_id may be used instead), so do not
        # index into it unconditionally.
        hosts = self.connection_config.hosts
        location = hosts[0] if hosts else self.connection_config.cloud_id
        ids = list(self._get_doc_ids())
        for batch in batch_generator(ids, self.index_config.batch_size):
            yield ElasticsearchBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    url=f"{location}/{index_name}",
                    date_processed=str(time()),
                ),
                additional_metadata=ElastisearchAdditionalMetadata(
                    index_name=index_name,
                ),
                batch_items=[BatchItem(identifier=b) for b in batch],
            )
206
+
207
+
208
class ElasticsearchDownloaderConfig(DownloaderConfig):
    """Download options for the Elasticsearch source."""

    # Document fields to fetch (sent as the query's ``_source``). Declared with
    # pydantic's ``Field`` like the other configs in this module, rather than
    # ``dataclasses.field``.
    fields: list[str] = Field(default_factory=list)
210
+
211
+
212
@dataclass
class ElasticsearchDownloader(Downloader):
    """Download batches of documents and write each one to a local text file."""

    connection_config: ElasticsearchConnectionConfig
    download_config: ElasticsearchDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that this downloader is driven through ``run_async``."""
        return True

    def get_identifier(self, index_name: str, record_id: str) -> str:
        """Build the identifier used for a document's file name.

        When specific fields are requested, a short hash of the field list is
        appended so downloads with different field selections do not collide.
        """
        identifier = f"{index_name}-{record_id}"
        if self.download_config.fields:
            fields_digest = hashlib.sha256(
                ",".join(self.download_config.fields).encode()
            ).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def map_es_results(self, es_results: dict) -> str:
        """Flatten a hit's ``_source`` and join its values, one per line."""
        flattened_dict = flatten_dict(dictionary=es_results["_source"])
        return "\n".join(str(value) for value in flattened_dict.values())

    def generate_download_response(
        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
    ) -> DownloadResponse:
        """Write one hit to disk and build its download response.

        Raises:
            SourceConnectionNetworkError: if the file cannot be written.
        """
        record_id = result["_id"]
        filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
        filename = f"{filename_id}.txt"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from index {index_name} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(download_path, "w", encoding="utf8") as f:
                f.write(self.map_es_results(es_results=result))
        except Exception as e:
            logger.error(
                f"failed to download from index {index_name} "
                f"and id {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            # Chain the cause so the underlying failure stays in the traceback.
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e
        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        cast_file_data.metadata.date_processed = str(time())
        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
        cast_file_data.metadata.record_locator = {
            "hosts": self.connection_config.hosts,
            "index_name": index_name,
            "document_id": record_id,
        }
        return super().generate_download_response(
            file_data=cast_file_data,
            download_path=download_path,
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Not supported; use ``run_async``."""
        raise NotImplementedError()

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_async(self):
        """Return the async client class and ``async_scan`` helper.

        Subclasses override this to swap the SDK.
        """
        from elasticsearch import AsyncElasticsearch
        from elasticsearch.helpers import async_scan

        return AsyncElasticsearch, async_scan

    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
        """Fetch every document in the batch and write each to its own file."""
        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
        AsyncClient, async_scan = self.load_async()

        index_name: str = elasticsearch_filedata.additional_metadata.index_name
        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]

        scan_query = {
            "_source": self.download_config.fields,
            "version": True,
            "query": {"ids": {"values": ids}},
        }

        # Named ``responses`` so the imported ``download_responses`` name is not shadowed.
        responses = []
        async with AsyncClient(**self.connection_config.get_client_kwargs()) as client:
            async for result in async_scan(
                client,
                query=scan_query,
                scroll="1m",
                index=index_name,
            ):
                responses.append(
                    self.generate_download_response(
                        result=result, index_name=index_name, file_data=elasticsearch_filedata
                    )
                )
        return responses
310
+
311
+
312
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
    """Staging options for the Elasticsearch destination."""

    # Written into the ``_index`` key of every staged document.
    index_name: str = Field(
        description=(
            "Name of the Elasticsearch index to pull data from, or upload data to."
        )
    )
316
+
317
+
318
@dataclass
class ElasticsearchUploadStager(UploadStager):
    """Reshape elements into bulk-action dictionaries for Elasticsearch."""

    upload_stager_config: ElasticsearchUploadStagerConfig

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Convert one element into an ``_index``/``_id``/``_source`` action dict.

        Works on a shallow copy, so ``element_dict`` keeps its top-level keys.
        """
        remaining = element_dict.copy()
        # Compute the id before any keys are popped from the copy.
        doc_id = get_enhanced_element_id(element_dict=remaining, file_data=file_data)

        source = {
            key: remaining.pop(key, None)
            for key in ("element_id", "embeddings", "text", "type")
        }
        source[RECORD_ID_LABEL] = file_data.identifier

        metadata = remaining.get("metadata")
        if "metadata" in remaining and isinstance(metadata, dict):
            source["metadata"] = flatten_dict(metadata, separator="-")

        return {
            "_index": self.upload_stager_config.index_name,
            "_id": doc_id,
            "_source": source,
        }
338
+
339
+
340
class ElasticsearchUploaderConfig(UploaderConfig):
    """Upload options for the Elasticsearch destination."""

    index_name: str = Field(
        description="Name of the Elasticsearch index to pull data from, or upload data to."
    )
    batch_size_bytes: int = Field(
        description="Size limit (in bytes) for each batch of items to be uploaded. Check"
        " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
        "#_how_big_is_too_big for more information.",
        default=15_000_000,
    )
    num_threads: int = Field(
        description="Number of threads to be used while uploading content",
        default=4,
    )
    record_id_key: str = Field(
        description="searchable key to find entries for the same record on previous runs",
        default=RECORD_ID_LABEL,
    )
357
+
358
+
359
@dataclass
class ElasticsearchUploader(Uploader):
    """Bulk-upload staged elements, replacing earlier entries for the same record."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ElasticsearchUploaderConfig
    connection_config: ElasticsearchConnectionConfig

    def precheck(self) -> None:
        """Verify that the cluster is reachable and the destination index exists.

        Raises:
            DestinationConnectionError: if the connection fails or the index is missing.
        """
        try:
            with self.connection_config.get_client() as client:
                indices = client.indices.get_alias(index="*")
                if self.upload_config.index_name not in indices:
                    raise DestinationConnectionError(
                        "index {} not found: {}".format(
                            self.upload_config.index_name, ", ".join(indices.keys())
                        )
                    )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is preserved.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def load_parallel_bulk(self):
        """Return the ``parallel_bulk`` helper; subclasses override to swap the SDK."""
        from elasticsearch.helpers import parallel_bulk

        return parallel_bulk

    def delete_by_record_id(self, client, file_data: FileData) -> None:
        """Delete documents whose record-id field matches ``file_data.identifier``.

        Raises:
            WriteError: if the delete-by-query response reports failures.
        """
        # Log the key that is actually queried; it is configurable and may
        # differ from RECORD_ID_LABEL.
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from {self.upload_config.index_name} index"
        )
        delete_resp = client.delete_by_query(
            index=self.upload_config.index_name,
            body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
        )
        logger.info(
            "deleted {} records from index {}".format(
                delete_resp["deleted"], self.upload_config.index_name
            )
        )
        if failures := delete_resp.get("failures"):
            raise WriteError(f"failed to delete records: {failures}")

    @requires_dependencies(["elasticsearch"], extras="elasticsearch")
    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:  # noqa: E501
        """Upload ``data`` in size-bounded batches using parallel bulk requests.

        Existing documents for the same record are deleted first. Any bulk
        failure is logged, with uploaded payloads stripped, and re-raised.
        """
        from elasticsearch.helpers.errors import BulkIndexError

        parallel_bulk = self.load_parallel_bulk()
        upload_destination = self.connection_config.hosts or self.connection_config.cloud_id

        logger.info(
            f"writing {len(data)} elements via document batches to destination "
            f"index named {self.upload_config.index_name} at {upload_destination} with "
            f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
            f"{self.upload_config.num_threads} (number of) threads"
        )

        with self.connection_config.get_client() as client:
            # Check existence first: with a missing index there is nothing to
            # delete, and the delete would fail before the warning is emitted.
            if client.indices.exists(index=self.upload_config.index_name):
                self.delete_by_record_id(client=client, file_data=file_data)
            else:
                logger.warning(
                    f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
                    f"{self.upload_config.index_name}. "
                    f"This may cause issues when uploading."
                )
            for batch in generator_batching_wbytes(
                data, batch_size_limit_bytes=self.upload_config.batch_size_bytes
            ):
                try:
                    iterator = parallel_bulk(
                        client=client,
                        actions=batch,
                        thread_count=self.upload_config.num_threads,
                    )
                    # parallel_bulk is lazy; drain it without keeping results.
                    collections.deque(iterator, maxlen=0)
                except BulkIndexError as e:
                    sanitized_errors = [
                        self._sanitize_bulk_index_error(error) for error in e.errors
                    ]
                    logger.error(
                        f"Batch upload failed - {e} - with following errors: {sanitized_errors}"
                    )
                    raise
                except Exception as e:
                    logger.error(f"Batch upload failed - {e}")
                    raise

    def _sanitize_bulk_index_error(self, error: dict[str, dict]) -> dict:
        """Remove data uploaded to index from the log, leave only error information.

        Error structure is `{<operation-type>: {..., "data": <uploaded-object>}}`.
        The ``error`` dict is modified in place and returned.
        """
        for error_data in error.values():
            error_data.pop("data", None)
        return error
454
+
455
+
456
# Registry entry for the Elasticsearch source connector.
elasticsearch_source_entry = SourceRegistryEntry(
    indexer=ElasticsearchIndexer,
    indexer_config=ElasticsearchIndexerConfig,
    downloader=ElasticsearchDownloader,
    downloader_config=ElasticsearchDownloaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)
463
+
464
# Registry entry for the Elasticsearch destination connector.
elasticsearch_destination_entry = DestinationRegistryEntry(
    upload_stager=ElasticsearchUploadStager,
    upload_stager_config=ElasticsearchUploadStagerConfig,
    uploader=ElasticsearchUploader,
    uploader_config=ElasticsearchUploaderConfig,
    connection_config=ElasticsearchConnectionConfig,
)
@@ -0,0 +1,195 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Optional
4
+
5
+ from pydantic import BaseModel, Field, Secret
6
+
7
+ from unstructured_ingest.error import (
8
+ DestinationConnectionError,
9
+ )
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.interfaces import (
12
+ AccessConfig,
13
+ ConnectionConfig,
14
+ )
15
+ from unstructured_ingest.v2.logger import logger
16
+ from unstructured_ingest.v2.processes.connector_registry import (
17
+ DestinationRegistryEntry,
18
+ SourceRegistryEntry,
19
+ )
20
+ from unstructured_ingest.v2.processes.connectors.elasticsearch.elasticsearch import (
21
+ ElasticsearchDownloader,
22
+ ElasticsearchDownloaderConfig,
23
+ ElasticsearchIndexer,
24
+ ElasticsearchIndexerConfig,
25
+ ElasticsearchUploader,
26
+ ElasticsearchUploaderConfig,
27
+ ElasticsearchUploadStager,
28
+ ElasticsearchUploadStagerConfig,
29
+ )
30
+
31
+ if TYPE_CHECKING:
32
+ from opensearchpy import OpenSearch
33
+
34
+ CONNECTOR_TYPE = "opensearch"
35
+
36
+ """Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
37
+ heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
38
+
39
+
40
class OpenSearchAccessConfig(AccessConfig):
    """Sensitive credentials accepted by the OpenSearch connector."""

    password: Optional[str] = Field(
        description="password when using basic auth",
        default=None,
    )
42
+
43
+
44
class OpenSearchClientInput(BaseModel):
    """Validated set of keyword arguments used to build the OpenSearch client."""

    # Optional wraps the Secret (not the other way round) so the ``None``
    # default matches the annotation.
    http_auth: Optional[Secret[tuple[str, str]]] = None
    hosts: Optional[list[str]] = None
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
53
+
54
+
55
class OpenSearchConnectionConfig(ConnectionConfig):
    """Connection settings for an OpenSearch cluster."""

    hosts: Optional[list[str]] = Field(
        description="List of the OpenSearch hosts to connect",
        examples=["http://localhost:9200"],
        default=None,
    )
    username: Optional[str] = Field(description="username when using basic auth", default=None)
    use_ssl: bool = Field(description="use ssl for the connection", default=False)
    verify_certs: bool = Field(description="whether to verify SSL certificates", default=False)
    ssl_show_warn: bool = Field(
        description="show warning when verify certs is disabled",
        default=False,
    )
    ca_certs: Optional[Path] = Field(description="path to CA bundle", default=None)
    client_cert: Optional[Path] = Field(
        description="path to the file containing the private key and the certificate,"
        " or cert only if using client_key",
        default=None,
    )
    client_key: Optional[Path] = Field(
        description="path to the file containing the private key"
        " if using separate cert and key files",
        default=None,
    )

    access_config: Secret[OpenSearchAccessConfig]

    def get_client_kwargs(self) -> dict:
        """Build the keyword arguments for the OpenSearch client.

        Only options that are set (truthy) are forwarded to the input model;
        path options are converted to strings. ``http_auth`` is set only when
        both a username and a password are configured.

        Returns:
            dict of the model's values, with ``None`` entries removed.
        """
        # Supported options:
        # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py
        secrets = self.access_config.get_secret_value()
        raw_inputs = {}

        if self.hosts:
            raw_inputs["hosts"] = self.hosts
        for flag_name in ("use_ssl", "verify_certs", "ssl_show_warn"):
            flag_value = getattr(self, flag_name)
            if flag_value:
                raw_inputs[flag_name] = flag_value
        for path_name in ("ca_certs", "client_cert", "client_key"):
            path_value = getattr(self, path_name)
            if path_value:
                raw_inputs[path_name] = str(path_value)
        if self.username and secrets.password:
            raw_inputs["http_auth"] = (self.username, secrets.password)

        client_input = OpenSearchClientInput(**raw_inputs)
        logger.debug(f"opensearch client inputs mapped to: {client_input.model_dump()}")

        resolved = client_input.model_dump()
        # Hand the client the plain credentials instead of the Secret wrapper.
        if client_input.http_auth is not None:
            resolved["http_auth"] = client_input.http_auth.get_secret_value()
        return {name: value for name, value in resolved.items() if value is not None}

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_client(self) -> "OpenSearch":
        """Create an OpenSearch client from ``get_client_kwargs``."""
        from opensearchpy import OpenSearch

        client_kwargs = self.get_client_kwargs()
        return OpenSearch(**client_kwargs)
117
+
118
+
119
class OpenSearchIndexerConfig(ElasticsearchIndexerConfig):
    """Indexer options for OpenSearch; identical to the Elasticsearch ones."""
121
+
122
+
123
@dataclass
class OpenSearchIndexer(ElasticsearchIndexer):
    """Elasticsearch indexer that uses the opensearch-py ``scan`` helper."""

    connection_config: OpenSearchConnectionConfig
    index_config: OpenSearchIndexerConfig
    # NOTE(review): excluded from __init__ and not assigned anywhere in the
    # visible code; confirm whether it is still needed.
    client: "OpenSearch" = field(init=False)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_scan(self):
        """Return the opensearch-py ``scan`` helper."""
        from opensearchpy import helpers

        return helpers.scan
134
+
135
+
136
class OpenSearchDownloaderConfig(ElasticsearchDownloaderConfig):
    """Downloader options for OpenSearch; identical to the Elasticsearch ones."""
138
+
139
+
140
@dataclass
class OpenSearchDownloader(ElasticsearchDownloader):
    """Elasticsearch downloader that uses the opensearch-py async client."""

    connection_config: OpenSearchConnectionConfig
    download_config: OpenSearchDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_async(self):
        """Return the opensearch-py async client class and ``async_scan`` helper."""
        from opensearchpy import AsyncOpenSearch
        from opensearchpy.helpers import async_scan

        async_tools = (AsyncOpenSearch, async_scan)
        return async_tools
152
+
153
+
154
class OpenSearchUploaderConfig(ElasticsearchUploaderConfig):
    """Uploader options for OpenSearch; identical to the Elasticsearch ones."""
156
+
157
+
158
@dataclass
class OpenSearchUploader(ElasticsearchUploader):
    """Elasticsearch uploader that uses the opensearch-py ``parallel_bulk`` helper."""

    connection_config: OpenSearchConnectionConfig
    upload_config: OpenSearchUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_parallel_bulk(self):
        """Return the opensearch-py ``parallel_bulk`` helper."""
        from opensearchpy import helpers

        return helpers.parallel_bulk
169
+
170
+
171
class OpenSearchUploadStagerConfig(ElasticsearchUploadStagerConfig):
    """Upload stager options for OpenSearch; identical to the Elasticsearch ones."""
173
+
174
+
175
@dataclass
class OpenSearchUploadStager(ElasticsearchUploadStager):
    """Upload stager for OpenSearch; inherits the Elasticsearch staging logic."""

    upload_stager_config: OpenSearchUploadStagerConfig
178
+
179
+
180
# Registry entry for the OpenSearch source connector.
opensearch_source_entry = SourceRegistryEntry(
    indexer=OpenSearchIndexer,
    indexer_config=OpenSearchIndexerConfig,
    downloader=OpenSearchDownloader,
    downloader_config=OpenSearchDownloaderConfig,
    connection_config=OpenSearchConnectionConfig,
)
187
+
188
+
189
# Registry entry for the OpenSearch destination connector.
opensearch_destination_entry = DestinationRegistryEntry(
    upload_stager=OpenSearchUploadStager,
    upload_stager_config=OpenSearchUploadStagerConfig,
    uploader=OpenSearchUploader,
    uploader_config=OpenSearchUploaderConfig,
    connection_config=OpenSearchConnectionConfig,
)