unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,218 @@
1
+ import typing as t
2
+ from dataclasses import dataclass, field
3
+
4
+ from dataclasses_json.core import Json
5
+
6
+ from unstructured_ingest.connector.elasticsearch import (
7
+ ElasticsearchDestinationConnector,
8
+ ElasticsearchDocumentMeta,
9
+ ElasticsearchIngestDoc,
10
+ ElasticsearchIngestDocBatch,
11
+ ElasticsearchSourceConnector,
12
+ SimpleElasticsearchConfig,
13
+ )
14
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
15
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
16
+ from unstructured_ingest.interfaces import AccessConfig, BaseSingleIngestDoc
17
+ from unstructured_ingest.logger import logger
18
+ from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
19
+ from unstructured_ingest.utils.dep_check import requires_dependencies
20
+
21
+ if t.TYPE_CHECKING:
22
+ from opensearchpy import OpenSearch
23
+
24
+ """Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
25
+ heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
26
+
27
+
28
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Connection and credential settings for an OpenSearch cluster.

    The connectors in this module unpack ``to_dict()`` of this config as
    keyword arguments into ``opensearchpy.OpenSearch``.
    """

    hosts: t.Optional[t.List[str]] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: t.Optional[str] = None
    client_cert: t.Optional[str] = None
    client_key: t.Optional[str] = None

    def to_dict(self, **kwargs) -> t.Dict[str, Json]:
        """Serialize the config and add the ``http_auth`` entry when credentials exist.

        ``http_auth`` is set to ``(username, password)`` unless both are unset.
        Partially configured credentials are still forwarded unchanged so that
        the misconfiguration surfaces in the client rather than being dropped.
        """
        d = super().to_dict(**kwargs)
        # A (None, None) pair carries no credentials; leaving the key out lets
        # the client connect without basic auth instead of receiving a
        # placeholder tuple.
        if self.username is not None or self.password is not None:
            d["http_auth"] = (self.username, self.password)
        return d
44
+
45
+
46
@dataclass
class SimpleOpenSearchConfig(SimpleElasticsearchConfig):
    """Elasticsearch connector config whose access settings are OpenSearch-typed."""

    # Defaults to None; the connectors in this module call
    # ``access_config.to_dict(...)`` on it when building a client.
    access_config: OpenSearchAccessConfig = None
49
+
50
+
51
@dataclass
class OpenSearchIngestDoc(ElasticsearchIngestDoc):
    """Single OpenSearch document, reusing the Elasticsearch ingest-doc behaviour.

    ``get_file`` does nothing in this class; within this module the document
    text is written to disk by ``OpenSearchIngestDocBatch.get_files``.
    """

    connector_config: SimpleOpenSearchConfig
    registry_name: str = "opensearch"

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """No-op: this method fetches nothing and returns ``None``."""
        return None
68
+
69
+
70
@dataclass
class OpenSearchIngestDocBatch(ElasticsearchIngestDocBatch):
    """Batch of OpenSearch documents that are fetched and written to disk together."""

    connector_config: SimpleOpenSearchConfig
    ingest_docs: t.List[OpenSearchIngestDoc] = field(default_factory=list)
    registry_name: str = "opensearch_batch"

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_docs(self):
        """Return the hits for ``self.list_of_ids`` as a list, using a fresh client."""
        from opensearchpy import OpenSearch
        from opensearchpy.helpers import scan

        client_kwargs = self.connector_config.access_config.to_dict(apply_name_overload=False)
        client = OpenSearch(**client_kwargs)
        hits = scan(
            client,
            query={
                # Restrict the returned source to the configured fields.
                "_source": self.connector_config.fields,
                "version": True,
                "query": {"ids": {"values": self.list_of_ids}},
            },
            scroll="1m",
            index=self.connector_config.index_name,
        )
        return list(hits)

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_files(self):
        """Write each fetched document to its download path and record its ingest doc.

        The file content is the newline-joined string form of every value in
        the flattened ``_source`` of the hit.
        """
        index_name = self.connector_config.index_name
        for hit in self._get_docs():
            ingest_doc = OpenSearchIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document=hit,
                document_meta=ElasticsearchDocumentMeta(index_name, hit["_id"]),
            )
            ingest_doc.update_source_metadata()

            source_body = hit["_source"]
            target_path = ingest_doc.filename
            flattened = flatten_dict(dictionary=source_body)
            text = "\n".join(str(value) for value in flattened.values())

            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, "w", encoding="utf8") as out_file:
                out_file.write(text)
            self.ingest_docs.append(ingest_doc)
121
+
122
+
123
@dataclass
class OpenSearchSourceConnector(ElasticsearchSourceConnector):
    """Fetches particular fields from all documents in a given opensearch cluster and index"""

    connector_config: SimpleOpenSearchConfig
    _ops: t.Optional["OpenSearch"] = field(init=False, default=None)

    @property
    def ops(self):
        """Lazily created OpenSearch client, cached on the instance after first use."""
        from opensearchpy import OpenSearch

        if self._ops is None:
            self._ops = OpenSearch(
                **self.connector_config.access_config.to_dict(apply_name_overload=False)
            )
        return self._ops

    def check_connection(self):
        """Ping the cluster and raise if it cannot be reached.

        Raises:
            SourceConnectionError: if creating the client or pinging raises, or
                if the ping result is falsy.
        """
        try:
            reachable = self.ops.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e
        # Checked explicitly rather than with ``assert``: assertions are removed
        # under ``python -O`` and would otherwise produce an empty error message.
        if not reachable:
            message = "failed to validate connection: ping to the cluster was unsuccessful"
            logger.error(message)
            raise SourceConnectionError(message)

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_doc_ids(self):
        """Fetches all document ids in an index"""
        from opensearchpy.helpers import scan

        # ``self.scan_query`` is not defined in this class; presumably it comes
        # from the Elasticsearch parent connector.
        hits = scan(
            self.ops,
            query=self.scan_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )

        return [hit["_id"] for hit in hits]

    def get_ingest_docs(self):
        """Fetches all documents in an index, using ids that are fetched with _get_doc_ids

        Returns:
            One ``OpenSearchIngestDocBatch`` per slice of at most
            ``connector_config.batch_size`` ids; an empty list when the index
            yields no ids.
        """
        ids = self._get_doc_ids()
        batch_size = self.connector_config.batch_size
        id_batches = [ids[start : start + batch_size] for start in range(0, len(ids), batch_size)]
        return [
            OpenSearchIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=batched_ids,
            )
            for batched_ids in id_batches
        ]
184
+
185
+
186
@dataclass
class OpenSearchDestinationConnector(ElasticsearchDestinationConnector):
    """Writes element dictionaries to an OpenSearch index using bulk requests."""

    connector_config: SimpleOpenSearchConfig
    _client: t.Optional["OpenSearch"] = field(init=False, default=None)

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def generate_client(self) -> "OpenSearch":
        """Build a new OpenSearch client from the configured access settings."""
        from opensearchpy import OpenSearch

        return OpenSearch(**self.connector_config.access_config.to_dict(apply_name_overload=False))

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]]) -> None:
        """Upload ``elements_dict`` in byte-size-limited batches via ``parallel_bulk``.

        Args:
            elements_dict: documents to upload; they are grouped with
                ``generator_batching_wbytes`` using ``write_config.batch_size_bytes``.

        Per-item failures reported by ``parallel_bulk`` are logged, not raised.
        """
        logger.info(
            "writing document batches to destination"
            f" index named {self.connector_config.index_name}"
            f" at {self.connector_config.access_config.hosts}"
            f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
            f" with {self.write_config.num_processes} (number of) processes"
        )
        from opensearchpy.helpers import parallel_bulk

        # ``self.client`` is not defined in this class; presumably it is provided
        # by the Elasticsearch parent connector.
        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                self.client, batch, thread_count=self.write_config.num_processes
            ):
                if not success:
                    # The placeholder is required: an extra positional argument
                    # without one makes the logging call fail to format and the
                    # failure details are lost.
                    logger.error(
                        "upload failed for a batch in opensearch destination connector: %s",
                        info,
                    )
@@ -0,0 +1,285 @@
1
+ import hashlib
2
+ import os
3
+ import typing as t
4
+ from collections import defaultdict
5
+ from dataclasses import dataclass, field
6
+ from itertools import chain
7
+ from pathlib import Path
8
+
9
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
10
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ BaseConnectorConfig,
14
+ BaseSingleIngestDoc,
15
+ BaseSourceConnector,
16
+ IngestDocCleanupMixin,
17
+ SourceConnectorCleanupMixin,
18
+ SourceMetadata,
19
+ )
20
+ from unstructured_ingest.logger import logger
21
+ from unstructured_ingest.utils.dep_check import requires_dependencies
22
+
23
+ MAX_NUM_EMAILS = 1000000 # Maximum number of emails per folder
24
+ if t.TYPE_CHECKING:
25
+ from office365.graph_client import GraphClient
26
+
27
+
28
class MissingFolderError(Exception):
    """Raised when no root mail folder matches any of the requested folder names."""
30
+
31
+
32
@dataclass
class OutlookAccessConfig(AccessConfig):
    """Secret part of the Outlook configuration."""

    # Client credential handed to MSAL; hidden from repr, flagged sensitive and
    # exposed under the overloaded name "client_cred".
    client_credential: str = enhanced_field(
        repr=False,
        sensitive=True,
        overload_name="client_cred",
    )
35
+
36
+
37
@dataclass
class SimpleOutlookConfig(BaseConnectorConfig):
    """Outlook connection settings plus helpers to obtain a token and a Graph client."""

    access_config: OutlookAccessConfig
    user_email: str
    client_id: str
    tenant: t.Optional[str] = field(repr=False, default="common")
    authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com")
    outlook_folders: t.List[str] = field(default_factory=list)
    recursive: bool = False
    registry_name: str = "outlook"

    def __post_init__(self):
        """Validate the mandatory values and register the token factory.

        Raises:
            ValueError: if ``client_id``, the client credential or
                ``user_email`` is missing or empty.
        """
        # All three values are required; the check fails as soon as any one is falsy.
        if not (self.client_id and self.access_config.client_credential and self.user_email):
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\nclient_id\nclient_cred\nuser_email",
            )
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"], extras="outlook")
    def _acquire_token(self):
        """Request a client-credential token for Microsoft Graph via MSAL.

        Returns:
            Whatever ``acquire_token_for_client`` returns; it is not inspected here.

        Raises:
            ValueError: re-raised after logging when MSAL rejects the setup.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.client_credential,
            )
            token = app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"],
            )
        except ValueError:
            logger.error("Couldn't set up credentials for Outlook")
            raise
        return token

    @requires_dependencies(["office365"], extras="outlook")
    def _get_client(self):
        """Create a new ``GraphClient`` that obtains tokens through ``token_factory``."""
        from office365.graph_client import GraphClient

        return GraphClient(self.token_factory)
81
+
82
+
83
@dataclass
class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """One Outlook message, downloaded as an ``.eml`` file named by a hash of its id."""

    connector_config: SimpleOutlookConfig
    message_id: str
    registry_name: str = "outlook"

    def __post_init__(self):
        self._set_download_paths()

    def hash_mail_name(self, id):
        """Outlook email ids are 152 char long. Hash to shorten to 16."""
        return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16]

    def _set_download_paths(self) -> None:
        """Creates paths for downloading and parsing."""
        download_path = Path(f"{self.read_config.download_dir}")
        output_path = Path(f"{self.processor_config.output_dir}")
        hashed_name = self.hash_mail_name(self.message_id)

        self.download_dir = download_path
        self.download_filepath = (download_path / f"{hashed_name}.eml").resolve()
        self.output_dir = output_path
        self.output_filepath = (output_path / f"{hashed_name}.eml.json").resolve()

    @property
    def filename(self):
        """Resolved path of the downloaded ``.eml`` file."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved path of the ``.eml.json`` output file."""
        return Path(self.output_filepath).resolve()

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Message id and mailbox address that identify this record."""
        return {
            "message_id": self.message_id,
            "user_email": self.connector_config.user_email,
        }

    @requires_dependencies(["office365"], extras="outlook")
    def update_source_metadata(self, **kwargs):
        """Fetch the message and store its metadata on ``self.source_metadata``.

        A 404 response marks the message as non-existent; any other request
        error is re-raised.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            client = self.connector_config._get_client()
            msg = (
                client.users[self.connector_config.user_email]
                .messages[self.message_id]
                .get()
                .execute_query()
            )
        except ClientRequestException as e:
            if e.response.status_code == 404:
                self.source_metadata = SourceMetadata(
                    exists=False,
                )
                return
            raise
        self.source_metadata = SourceMetadata(
            date_created=msg.created_datetime.isoformat(),
            date_modified=msg.last_modified_datetime.isoformat(),
            version=msg.get_property("changeKey"),
            source_url=msg.get_property("webLink"),
            exists=True,
        )

    @SourceConnectionNetworkError.wrap
    def _run_download(self, local_file):
        """Download the message content into the already opened ``local_file``."""
        client = self.connector_config._get_client()
        client.users[self.connector_config.user_email].messages[self.message_id].download(
            local_file,
        ).execute_query()

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"], extras="outlook")
    def get_file(self):
        """Relies on Office365 python sdk message object to do the download.

        Failures are logged and swallowed (best effort); nothing is raised
        from the download itself.
        """
        hashed_name = self.hash_mail_name(self.message_id)
        try:
            # No client is created here: update_source_metadata and
            # _run_download each obtain their own.
            self.update_source_metadata()
            if not self.download_dir.is_dir():
                logger.debug(f"creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            with open(self.download_dir / f"{hashed_name}.eml", "wb") as local_file:
                self._run_download(local_file=local_file)

        except Exception as e:
            logger.error(
                f"Error while downloading and saving file: {hashed_name}.",
            )
            logger.error(e)
            return
        logger.info(f"file downloaded: {hashed_name}")
187
+
188
+
189
@dataclass
class OutlookSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Lists the messages found in the requested Outlook root folders and their subfolders."""

    connector_config: SimpleOutlookConfig
    _client: t.Optional["GraphClient"] = field(init=False, default=None)

    @property
    def client(self) -> "GraphClient":
        """Lazily created Graph client, cached on the instance after first use."""
        if self._client is None:
            self._client = self.connector_config._get_client()
        return self._client

    def initialize(self):
        """Resolve the folder ids to read; wrap any failure in SourceConnectionError."""
        try:
            self.get_folder_ids()
        except Exception as e:
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def check_connection(self):
        """Ensure a Graph client can be constructed.

        Raises:
            SourceConnectionError: if creating the client raises.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def recurse_folders(self, folder_id, main_folder_dict):
        """We only get a count of subfolders for any folder.
        Have to make additional calls to get subfolder ids.

        Each subfolder id is appended to every list in ``main_folder_dict``
        that already contains its parent folder id.
        """
        subfolders = (
            self.client.users[self.connector_config.user_email]
            .mail_folders[folder_id]
            .child_folders.get()
            .execute_query()
        )
        for subfolder in subfolders:
            parent_id = subfolder.get_property("parentFolderId")
            for folder_ids in main_folder_dict.values():
                if parent_id in folder_ids:
                    folder_ids.append(subfolder.id)
            if subfolder.get_property("childFolderCount") > 0:
                self.recurse_folders(subfolder.id, main_folder_dict)

    def get_folder_ids(self):
        """Sets the mail folder ids and subfolder ids for requested root mail folders.

        Raises:
            MissingFolderError: if no root folder name matches a requested
                name (compared case-insensitively).
        """
        # NOTE(review): connector_config.recursive is not consulted here;
        # subfolders are always collected. Confirm whether that is intended.
        self.root_folders = defaultdict(list)
        root_folders_with_subfolders = []
        get_root_folders = (
            self.client.users[self.connector_config.user_email].mail_folders.get().execute_query()
        )

        for folder in get_root_folders:
            self.root_folders[folder.display_name].append(folder.id)
            if folder.get_property("childFolderCount") > 0:
                root_folders_with_subfolders.append(folder.id)

        for folder in root_folders_with_subfolders:
            self.recurse_folders(folder, self.root_folders)

        # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested.
        # The lower-cased names are built once instead of once per root folder.
        requested_names = {name.lower() for name in self.connector_config.outlook_folders}
        self.selected_folder_ids = list(
            chain.from_iterable(
                folder_ids
                for display_name, folder_ids in self.root_folders.items()
                if display_name.lower() in requested_names
            ),
        )
        if not self.selected_folder_ids:
            raise MissingFolderError(
                "There are no root folders with the names: "
                f"{self.connector_config.outlook_folders}",
            )

    def get_ingest_docs(self):
        """Returns a list of all the message objects that are in the requested root folder(s).

        Reads ``self.selected_folder_ids``, which is set by ``get_folder_ids``.
        """
        filtered_messages = []

        # Get all the relevant messages in the selected folders/subfolders.
        for folder_id in self.selected_folder_ids:
            messages = (
                self.client.users[self.connector_config.user_email]
                .mail_folders[folder_id]
                .messages.get()
                .top(MAX_NUM_EMAILS)  # Prevents the return from paging
                .execute_query()
            )
            # Skip empty list if there are no messages in folder.
            if messages:
                filtered_messages.append(messages)
        return [
            OutlookIngestDoc(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                message_id=message.id,
            )
            for message in chain.from_iterable(filtered_messages)
        ]
@@ -0,0 +1,140 @@
1
+ import copy
2
+ import json
3
+ import multiprocessing as mp
4
+ import typing as t
5
+ import uuid
6
+ from dataclasses import dataclass
7
+
8
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
9
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
10
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
11
+ from unstructured_ingest.interfaces import (
12
+ AccessConfig,
13
+ BaseConnectorConfig,
14
+ BaseDestinationConnector,
15
+ ConfigSessionHandleMixin,
16
+ IngestDocSessionHandleMixin,
17
+ WriteConfig,
18
+ )
19
+ from unstructured_ingest.logger import logger
20
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
21
+ from unstructured_ingest.utils.dep_check import requires_dependencies
22
+
23
+ if t.TYPE_CHECKING:
24
+ from pinecone import Index as PineconeIndex
25
+
26
+
27
@dataclass
class PineconeAccessConfig(AccessConfig):
    """Secret part of the Pinecone configuration."""

    # API key passed to the Pinecone client; flagged as sensitive.
    api_key: str = enhanced_field(sensitive=True)
30
+
31
+
32
@dataclass
class SimplePineconeConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
    """Identifies the Pinecone index to write to and carries the access settings."""

    # Name of the index that is opened and upserted into.
    index_name: str
    environment: str
    # Holds the API key used to create the Pinecone client.
    access_config: PineconeAccessConfig
37
+
38
+
39
@dataclass
class PineconeWriteConfig(WriteConfig):
    """Tuning knobs for uploads performed by the Pinecone destination connector."""

    # Number of elements sent per upsert call.
    batch_size: int = 50
    # 1 uploads sequentially; any other value uploads through a process pool.
    num_processes: int = 1
43
+
44
+
45
@dataclass
class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector):
    """Upserts normalized elements into a Pinecone index."""

    write_config: PineconeWriteConfig
    connector_config: SimplePineconeConfig
    _index: t.Optional["PineconeIndex"] = None

    def to_dict(self, **kwargs):
        """
        The _index variable in this dataclass breaks deepcopy due to:
        TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        # Work on a shallow copy so the live index handle on ``self`` is kept.
        self_cp = copy.copy(self)
        self_cp._index = None
        return _asdict(self_cp, **kwargs)

    @property
    def pinecone_index(self):
        """Lazily created index handle, cached on the instance after first use."""
        if self._index is None:
            self._index = self.create_index()
        return self._index

    def initialize(self):
        pass

    @requires_dependencies(["pinecone"], extras="pinecone")
    def create_index(self) -> "PineconeIndex":
        """Open a handle to the configured index (this does not create a new index)."""
        from pinecone import Pinecone
        from unstructured import __version__ as unstructured_version

        pc = Pinecone(
            api_key=self.connector_config.access_config.api_key,
            source_tag=f"unstructured=={unstructured_version}",
        )

        index = pc.Index(self.connector_config.index_name)
        logger.debug(f"connected to index: {pc.describe_index(self.connector_config.index_name)}")
        return index

    @DestinationConnectionError.wrap
    def check_connection(self):
        """Validate connectivity by forcing the index handle to be created."""
        _ = self.pinecone_index

    @DestinationConnectionError.wrap
    @requires_dependencies(["pinecone"], extras="pinecone")
    def upsert_batch(self, batch):
        """Upsert one batch of vectors.

        Raises:
            WriteError: when the Pinecone client raises ``ApiException``.
        """
        import pinecone.core.client.exceptions

        index = self.pinecone_index
        try:
            response = index.upsert(batch)
        except pinecone.core.client.exceptions.ApiException as api_error:
            raise WriteError(f"http error: {api_error}") from api_error
        logger.debug(f"results: {response}")

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Upsert ``elements_dict`` in batches of ``write_config.batch_size``.

        With ``num_processes == 1`` batches are sent sequentially; otherwise
        they are distributed over a multiprocessing pool.
        """
        logger.info(
            f"Upserting {len(elements_dict)} elements to destination "
            f"index at {self.connector_config.index_name}",
        )

        pinecone_batch_size = self.write_config.batch_size

        logger.info(f"using {self.write_config.num_processes} processes to upload")
        if self.write_config.num_processes == 1:
            for chunk in batch_generator(elements_dict, pinecone_batch_size):
                self.upsert_batch(chunk)  # noqa: E203

        else:
            # NOTE(review): pool.map has to pickle the bound method and thus this
            # instance; presumably that fails once ``_index`` holds a live
            # handle (see the to_dict docstring) — confirm.
            with mp.Pool(
                processes=self.write_config.num_processes,
            ) as pool:
                pool.map(
                    self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size))
                )

    def normalize_dict(self, element_dict: dict) -> dict:
        """Convert one element dict into a Pinecone record.

        Returns:
            A dict with a random UUID ``id``, the element's ``embeddings`` as
            ``values``, and ``metadata`` holding the text, a JSON dump of the
            remaining fields and those same fields flattened with ``-``.

        The caller's ``element_dict`` is left untouched.
        """
        # While flatten_dict enables indexing on various fields,
        # element_serialized enables easily reloading the element object to memory.
        # element_serialized is formed without text/embeddings to avoid data bloating.
        remaining = dict(element_dict)  # shallow copy: pops must not leak to the caller
        embeddings = remaining.pop("embeddings", None)
        text = remaining.pop("text", None)
        return {
            "id": str(uuid.uuid4()),
            "values": embeddings,
            "metadata": {
                "text": text,
                "element_serialized": json.dumps(remaining),
                **flatten_dict(
                    remaining,
                    separator="-",
                    flatten_lists=True,
                    remove_none=True,
                ),
            },
        }