unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,334 @@
1
+ import hashlib
2
+ import time
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from datetime import timedelta
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Generator, List
8
+
9
+ from pydantic import BaseModel, Field, Secret
10
+
11
+ from unstructured_ingest.error import (
12
+ DestinationConnectionError,
13
+ SourceConnectionError,
14
+ SourceConnectionNetworkError,
15
+ )
16
+ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
17
+ from unstructured_ingest.utils.dep_check import requires_dependencies
18
+ from unstructured_ingest.v2.interfaces import (
19
+ AccessConfig,
20
+ BatchFileData,
21
+ BatchItem,
22
+ ConnectionConfig,
23
+ Downloader,
24
+ DownloaderConfig,
25
+ DownloadResponse,
26
+ FileData,
27
+ FileDataSourceMetadata,
28
+ Indexer,
29
+ IndexerConfig,
30
+ SourceIdentifiers,
31
+ Uploader,
32
+ UploaderConfig,
33
+ UploadStager,
34
+ UploadStagerConfig,
35
+ download_responses,
36
+ )
37
+ from unstructured_ingest.v2.logger import logger
38
+ from unstructured_ingest.v2.processes.connector_registry import (
39
+ DestinationRegistryEntry,
40
+ SourceRegistryEntry,
41
+ )
42
+
43
+ if TYPE_CHECKING:
44
+ from couchbase.cluster import Cluster
45
+ from couchbase.collection import Collection
46
+
47
# Connector type label used by the connection config, indexer, downloader and
# uploader defined below.
CONNECTOR_TYPE = "couchbase"
# NOTE(review): not referenced in the visible part of this module.
SERVER_API_VERSION = "1"
49
+
50
+
51
class CouchbaseAdditionalMetadata(BaseModel):
    """Connector-specific metadata attached to each Couchbase batch."""

    # Name of the bucket the batch's documents are read from.
    bucket: str
53
+
54
+
55
class CouchbaseBatchFileData(BatchFileData):
    """Batch file data whose additional metadata carries the Couchbase bucket name."""

    additional_metadata: CouchbaseAdditionalMetadata
57
+
58
+
59
class CouchbaseAccessConfig(AccessConfig):
    """Sensitive credentials used to authenticate against Couchbase."""

    password: str = Field(
        description="The password for the Couchbase server",
    )
61
+
62
+
63
class CouchbaseConnectionConfig(ConnectionConfig):
    """Connection settings shared by the Couchbase source and destination connectors."""

    username: str = Field(description="The username for the Couchbase server")
    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
    connection_string: str = Field(
        default="couchbase://localhost",
        description="The connection string of the Couchbase server",
    )
    scope: str = Field(
        default="_default",
        description="The scope to connect to on the Couchbase server",
    )
    collection: str = Field(
        default="_default",
        description="The collection to connect to on the Couchbase server",
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    access_config: Secret[CouchbaseAccessConfig]

    @requires_dependencies(["couchbase"], extras="couchbase")
    @contextmanager
    def get_client(self) -> Generator["Cluster", None, None]:
        """Yield a ready ``Cluster`` and close it when the ``with`` block exits.

        Because this is a ``@contextmanager``, the connection is only attempted
        once the returned context manager is entered.
        """
        from couchbase.auth import PasswordAuthenticator
        from couchbase.cluster import Cluster
        from couchbase.options import ClusterOptions

        password = self.access_config.get_secret_value().password
        cluster_options = ClusterOptions(PasswordAuthenticator(self.username, password))
        cluster_options.apply_profile("wan_development")
        connected = None
        try:
            connected = Cluster(self.connection_string, cluster_options)
            # Give the cluster up to five seconds to report ready.
            connected.wait_until_ready(timedelta(seconds=5))
            yield connected
        finally:
            # `connected` is still None when the Cluster constructor itself failed.
            if connected:
                connected.close()
96
+
97
+
98
class CouchbaseUploadStagerConfig(UploadStagerConfig):
    """Stager configuration for Couchbase; adds no options of its own."""
100
+
101
+
102
@dataclass
class CouchbaseUploadStager(UploadStager):
    """Reshape element dictionaries into the document layout written to Couchbase."""

    upload_stager_config: CouchbaseUploadStagerConfig = field(
        default_factory=CouchbaseUploadStagerConfig
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a single-entry mapping of ``element_id`` to the document body.

        The body keeps only ``embedding`` (read from the ``embeddings`` key),
        ``text``, ``metadata`` and ``type``; missing keys become ``None``.

        Raises:
            KeyError: if ``element_dict`` has no ``element_id``.
        """
        element = element_dict.copy()
        document = {
            "embedding": element.get("embeddings"),
            "text": element.get("text"),
            "metadata": element.get("metadata"),
            "type": element.get("type"),
        }
        return {element["element_id"]: document}
118
+
119
+
120
class CouchbaseUploaderConfig(UploaderConfig):
    """Upload tuning options for the Couchbase destination."""

    batch_size: int = Field(
        default=50,
        description="Number of documents to upload per batch",
    )
122
+
123
+
124
@dataclass
class CouchbaseUploader(Uploader):
    """Upsert staged element documents into a Couchbase collection."""

    connection_config: CouchbaseConnectionConfig
    upload_config: CouchbaseUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify that a cluster connection can actually be established.

        Raises:
            DestinationConnectionError: if the client cannot be created or the
                cluster does not become ready.
        """
        try:
            # get_client() is a context manager: the connection is only attempted
            # once it is entered, so a bare call would validate nothing.
            with self.connection_config.get_client():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upsert ``data`` into the configured bucket/scope/collection in batches.

        Each item of ``data`` is a mapping of document id to document body (the
        shape produced by ``CouchbaseUploadStager.conform_dict``); the items of
        one batch are merged into a single mapping for ``upsert_multi``.
        """
        logger.info(
            f"writing {len(data)} objects to destination "
            f"bucket, {self.connection_config.bucket} "
            f"at {self.connection_config.connection_string}",
        )
        with self.connection_config.get_client() as client:
            bucket = client.bucket(self.connection_config.bucket)
            scope = bucket.scope(self.connection_config.scope)
            collection = scope.collection(self.connection_config.collection)

            for chunk in batch_generator(data, self.upload_config.batch_size):
                documents = {
                    doc_id: body for entry in chunk for doc_id, body in entry.items()
                }
                collection.upsert_multi(documents)
152
+
153
+
154
class CouchbaseIndexerConfig(IndexerConfig):
    """Indexing options for the Couchbase source."""

    batch_size: int = Field(
        default=50,
        description="Number of documents to index per batch",
    )
156
+
157
+
158
@dataclass
class CouchbaseIndexer(Indexer):
    """Enumerate document ids in a Couchbase collection and emit them in batches."""

    connection_config: CouchbaseConnectionConfig
    index_config: CouchbaseIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify that a cluster connection can actually be established.

        Raises:
            DestinationConnectionError: if the client cannot be created or the
                cluster does not become ready.
        """
        # NOTE(review): this is a source-side check but raises
        # DestinationConnectionError; the type is kept so existing callers see
        # the same exception.
        try:
            # get_client() is a context manager: the connection is only attempted
            # once it is entered, so a bare call would validate nothing.
            with self.connection_config.get_client():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["couchbase"], extras="couchbase")
    def _get_doc_ids(self) -> List[str]:
        """Return the ids of all documents in the configured collection.

        The query is attempted up to five times with a three second pause
        between attempts.

        Raises:
            SourceConnectionError: if the final attempt fails.
        """
        query = (
            f"SELECT META(d).id "
            f"FROM `{self.connection_config.bucket}`."
            f"`{self.connection_config.scope}`."
            f"`{self.connection_config.collection}` as d"
        )

        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                with self.connection_config.get_client() as client:
                    return [row["id"] for row in client.query(query)]
            except Exception as e:
                if attempt == max_attempts:
                    raise SourceConnectionError(f"failed to get document ids: {e}") from e
                logger.warning(
                    f"attempt {attempt}/{max_attempts} to get document ids failed: {e}"
                )
                # Only pause when another attempt will follow.
                time.sleep(3)

    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
        """Yield one ``CouchbaseBatchFileData`` per batch of document ids."""
        ids = self._get_doc_ids()
        for batch in batch_generator(ids, self.index_config.batch_size):
            yield CouchbaseBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    url=f"{self.connection_config.connection_string}/"
                    f"{self.connection_config.bucket}",
                    date_processed=str(time.time()),
                ),
                additional_metadata=CouchbaseAdditionalMetadata(
                    bucket=self.connection_config.bucket
                ),
                batch_items=[BatchItem(identifier=b) for b in batch],
            )
210
+
211
+
212
class CouchbaseDownloaderConfig(DownloaderConfig):
    """Download options for the Couchbase source."""

    collection_id: str = Field(
        default="id", description="The unique key of the id field in the collection"
    )
    # Declared with pydantic's Field (not dataclasses.field) like every other
    # model attribute in this module. In this module the value is only used to
    # derive a hash suffix for download file names.
    fields: list[str] = Field(default_factory=list)
217
+
218
+
219
@dataclass
class CouchbaseDownloader(Downloader):
    """Fetch Couchbase documents by id and write each one to a local text file."""

    connection_config: CouchbaseConnectionConfig
    download_config: CouchbaseDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that only the synchronous ``run`` is implemented."""
        return False

    def get_identifier(self, bucket: str, record_id: str) -> str:
        """Build the per-document identifier, also used as the download file stem.

        When ``download_config.fields`` is non-empty, the first eight hex
        characters of the SHA-256 of the comma-joined field names are appended.
        """
        identifier = f"{bucket}-{record_id}"
        if self.download_config.fields:
            joined_fields = ",".join(self.download_config.fields)
            fields_digest = hashlib.sha256(joined_fields.encode()).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def map_cb_results(self, cb_results: dict) -> str:
        """Flatten a document body and join its stringified values with newlines."""
        flattened = flatten_dict(dictionary=cb_results)
        return "\n".join(str(value) for value in flattened.values())

    def generate_download_response(
        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
    ) -> DownloadResponse:
        """Write one document to ``<identifier>.txt`` and build its download response.

        Args:
            result: The document body; must contain ``download_config.collection_id``.
            bucket: Name of the bucket the document came from.
            file_data: The batch file data; its ``source_identifiers`` is
                overwritten before it is cast to a per-document ``FileData``.

        Raises:
            KeyError: if ``result`` lacks the configured id key.
            SourceConnectionNetworkError: if writing the local file fails.
        """
        record_id = result[self.download_config.collection_id]
        filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
        filename = f"{filename_id}.txt"
        download_path = self.download_dir / Path(filename)
        logger.debug(
            f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
        )
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(download_path, "w", encoding="utf8") as f:
                f.write(self.map_cb_results(cb_results=result))
        except Exception as e:
            logger.error(
                f"failed to download from bucket {bucket} "
                f"and id {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e
        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename_id
        cast_file_data.metadata.date_processed = str(time.time())
        cast_file_data.metadata.record_locator = {
            "connection_string": self.connection_config.connection_string,
            "bucket": bucket,
            "scope": self.connection_config.scope,
            "collection": self.connection_config.collection,
            "document_id": record_id,
        }
        return super().generate_download_response(
            file_data=cast_file_data,
            download_path=download_path,
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Download every document listed in the batch and return the responses."""
        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
        bucket_name: str = couchbase_file_data.additional_metadata.bucket
        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]

        with self.connection_config.get_client() as client:
            bucket = client.bucket(bucket_name)
            scope = bucket.scope(self.connection_config.scope)
            collection = scope.collection(self.connection_config.collection)

            # Materialise the generator while the cluster is still open.
            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
            return list(download_resp)

    def process_doc_id(
        self,
        doc_id: str,
        collection: "Collection",
        bucket_name: str,
        file_data: CouchbaseBatchFileData,
    ):
        """Fetch a single document by id and turn it into a download response."""
        result = collection.get(doc_id)
        return self.generate_download_response(
            result=result.content_as[dict], bucket=bucket_name, file_data=file_data
        )

    def process_all_doc_ids(
        self,
        ids: list[str],
        collection: "Collection",
        bucket_name: str,
        file_data: CouchbaseBatchFileData,
    ):
        """Lazily yield a download response for each id in ``ids``."""
        for doc_id in ids:
            yield self.process_doc_id(doc_id, collection, bucket_name, file_data)

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Not supported; this downloader is synchronous only."""
        raise NotImplementedError()
318
+
319
+
320
# Registry entry bundling the classes that write to Couchbase.
couchbase_destination_entry = DestinationRegistryEntry(
    connection_config=CouchbaseConnectionConfig,
    upload_stager_config=CouchbaseUploadStagerConfig,
    upload_stager=CouchbaseUploadStager,
    uploader_config=CouchbaseUploaderConfig,
    uploader=CouchbaseUploader,
)

# Registry entry bundling the classes that read from Couchbase.
couchbase_source_entry = SourceRegistryEntry(
    connection_config=CouchbaseConnectionConfig,
    indexer_config=CouchbaseIndexerConfig,
    indexer=CouchbaseIndexer,
    downloader_config=CouchbaseDownloaderConfig,
    downloader=CouchbaseDownloader,
)
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
9
+ from .volumes_aws import (
10
+ databricks_aws_volumes_destination_entry,
11
+ databricks_aws_volumes_source_entry,
12
+ )
13
+ from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
14
+ from .volumes_azure import (
15
+ databricks_azure_volumes_destination_entry,
16
+ databricks_azure_volumes_source_entry,
17
+ )
18
+ from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
19
+ from .volumes_gcp import (
20
+ databricks_gcp_volumes_destination_entry,
21
+ databricks_gcp_volumes_source_entry,
22
+ )
23
+ from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
24
+ from .volumes_native import (
25
+ databricks_native_volumes_destination_entry,
26
+ databricks_native_volumes_source_entry,
27
+ )
28
+
29
# Register every Databricks Volumes flavour as both a source and a destination.
# The tuple order (AWS, GCP, native, Azure) keeps the original registration order.
for _connector_type, _source_entry, _destination_entry in (
    (
        VOLUMES_AWS_CONNECTOR_TYPE,
        databricks_aws_volumes_source_entry,
        databricks_aws_volumes_destination_entry,
    ),
    (
        VOLUMES_GCP_CONNECTOR_TYPE,
        databricks_gcp_volumes_source_entry,
        databricks_gcp_volumes_destination_entry,
    ),
    (
        VOLUMES_NATIVE_CONNECTOR_TYPE,
        databricks_native_volumes_source_entry,
        databricks_native_volumes_destination_entry,
    ),
    (
        VOLUMES_AZURE_CONNECTOR_TYPE,
        databricks_azure_volumes_source_entry,
        databricks_azure_volumes_destination_entry,
    ),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)
# Drop the loop temporaries so they do not linger in the package namespace.
del _connector_type, _source_entry, _destination_entry
@@ -0,0 +1,208 @@
1
+ import os
2
+ from abc import ABC
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Generator, Optional
6
+ from uuid import NAMESPACE_DNS, uuid5
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import (
12
+ ProviderError,
13
+ RateLimitError,
14
+ UserAuthError,
15
+ UserError,
16
+ )
17
+ from unstructured_ingest.v2.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ Downloader,
21
+ DownloaderConfig,
22
+ DownloadResponse,
23
+ FileData,
24
+ FileDataSourceMetadata,
25
+ Indexer,
26
+ IndexerConfig,
27
+ SourceIdentifiers,
28
+ Uploader,
29
+ UploaderConfig,
30
+ )
31
+ from unstructured_ingest.v2.logger import logger
32
+
33
+ if TYPE_CHECKING:
34
+ from databricks.sdk import WorkspaceClient
35
+
36
+
37
class DatabricksPathMixin(BaseModel):
    """Fields locating a Unity Catalog volume, plus the path derived from them."""

    volume: str = Field(description="Name of volume in the Unity Catalog")
    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
    volume_path: Optional[str] = Field(
        default=None,
        description="Optional path within the volume to write to",
    )
    databricks_schema: str = Field(
        default="default",
        alias="schema",
        description="Schema associated with the volume to write to in the Unity Catalog service",
    )

    @property
    def path(self) -> str:
        """Return ``/Volumes/<catalog>/<schema>/<volume>[/<volume_path>]``."""
        segments = ["/Volumes", self.catalog, self.databricks_schema, self.volume]
        # The sub-path is only appended when it is set and non-empty.
        if self.volume_path:
            segments.append(self.volume_path)
        return "/".join(segments)
55
+
56
+
57
class DatabricksVolumesAccessConfig(AccessConfig):
    """Base credentials for Databricks Volumes connectors."""

    token: Optional[str] = Field(
        default=None,
        description="Databricks Personal Access Token",
    )
59
+
60
+
61
class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
    """Base connection settings for Databricks Volumes connectors."""

    host: Optional[str] = Field(
        default=None,
        description="The Databricks host URL for either the "
        "Databricks workspace endpoint or the "
        "Databricks accounts endpoint.",
    )

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an exception from the Databricks SDK into an ingest error.

        A ``ValueError`` whose message prefix (the text before the first ``:``)
        ends with ``auth`` becomes ``UserAuthError``. A ``DatabricksError`` is
        mapped through its HTTP status code: 401/403 to ``UserAuthError``, 429
        to ``RateLimitError``, other 4xx to ``UserError`` and 5xx to
        ``ProviderError``. Anything else is logged and returned unchanged.

        Returns:
            The exception the caller should raise; this method never raises it.
        """
        from databricks.sdk.errors.base import DatabricksError
        from databricks.sdk.errors.platform import STATUS_CODE_MAPPING

        if isinstance(e, ValueError):
            # A ValueError may carry no args or a non-string first arg; the
            # translator must not fail on either while handling another error.
            error_message = str(e.args[0]) if e.args else ""
            message_split = error_message.split(":")
            if message_split[0].endswith("auth"):
                return UserAuthError(e)
        if isinstance(e, DatabricksError):
            # STATUS_CODE_MAPPING goes status code -> error class; invert it to
            # look the status code up from the raised error's type.
            reverse_mapping = {v: k for k, v in STATUS_CODE_MAPPING.items()}
            if status_code := reverse_mapping.get(type(e)):
                if status_code in [401, 403]:
                    return UserAuthError(e)
                if status_code == 429:
                    return RateLimitError(e)
                if 400 <= status_code < 500:
                    return UserError(e)
                if 500 <= status_code < 600:
                    return ProviderError(e)
        logger.error(f"unhandled exception from databricks: {e}", exc_info=True)
        return e

    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
    def get_client(self) -> "WorkspaceClient":
        """Build a ``WorkspaceClient`` from ``host`` and the access config fields."""
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient(
            host=self.host,
            **self.access_config.get_secret_value().model_dump(),
        )
100
+
101
+
102
class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
    """Indexer options: the volume location plus whether to list recursively."""

    # Passed straight through as the `recursive` argument of the listing call.
    recursive: bool = False
104
+
105
+
106
@dataclass
class DatabricksVolumesIndexer(Indexer, ABC):
    """List the files under a Databricks volume path and emit a FileData for each."""

    index_config: DatabricksVolumesIndexerConfig
    connection_config: DatabricksVolumesConnectionConfig

    def precheck(self) -> None:
        """Build a workspace client, translating any failure through ``wrap_error``."""
        try:
            self.connection_config.get_client()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e) from e

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every non-directory entry under the index path.

        Any exception raised while listing is passed through ``wrap_error``
        before being re-raised.
        """
        try:
            root = self.index_config.path
            for file_info in self.connection_config.get_client().dbfs.list(
                path=root, recursive=self.index_config.recursive
            ):
                if file_info.is_dir:
                    continue
                # Strip only the first occurrence of the root: a bare replace()
                # would also remove the same text if it recurs deeper in the path.
                rel_path = file_info.path.replace(root, "", 1)
                if rel_path.startswith("/"):
                    rel_path = rel_path[1:]
                filename = Path(file_info.path).name
                yield FileData(
                    # Deterministic id derived from the full remote path.
                    identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
                    connector_type=self.connector_type,
                    source_identifiers=SourceIdentifiers(
                        filename=filename,
                        rel_path=rel_path,
                        fullpath=file_info.path,
                    ),
                    additional_metadata={
                        "catalog": self.index_config.catalog,
                        "path": file_info.path,
                    },
                    metadata=FileDataSourceMetadata(
                        url=file_info.path, date_modified=str(file_info.modification_time)
                    ),
                )
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)
146
+
147
+
148
class DatabricksVolumesDownloaderConfig(DownloaderConfig):
    """Downloader configuration for Databricks Volumes; adds no options of its own."""
150
+
151
+
152
@dataclass
class DatabricksVolumesDownloader(Downloader, ABC):
    """Download a single file from a Databricks volume to the local download dir."""

    download_config: DatabricksVolumesDownloaderConfig
    connection_config: DatabricksVolumesConnectionConfig

    def precheck(self) -> None:
        """Build a workspace client, translating any failure through ``wrap_error``."""
        try:
            self.connection_config.get_client()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local path: the download dir joined with the file's relative path."""
        relative = Path(file_data.source_identifiers.relative_path)
        return self.download_config.download_dir / relative

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Fetch the remote file named in ``additional_metadata["path"]`` and save it.

        The whole file is read into memory before it is written locally. Errors
        raised while fetching are passed through ``wrap_error``.
        """
        target = self.get_download_path(file_data=file_data)
        target.parent.mkdir(parents=True, exist_ok=True)
        remote_path = file_data.additional_metadata["path"]
        logger.info(f"Writing {file_data.identifier} to {target}")
        try:
            with self.connection_config.get_client().dbfs.download(path=remote_path) as handle:
                # NOTE(review): relies on the private `_read_handle` attribute.
                payload = handle._read_handle.read()
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)
        with open(target, "wb") as local_file:
            local_file.write(payload)
        return self.generate_download_response(file_data=file_data, download_path=target)
179
+
180
+
181
class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
    """Uploader options: only the volume location inherited from the path mixin."""
183
+
184
+
185
@dataclass
class DatabricksVolumesUploader(Uploader, ABC):
    """Upload element files as ``<filename>.json`` into a Databricks volume path."""

    upload_config: DatabricksVolumesUploaderConfig
    connection_config: DatabricksVolumesConnectionConfig

    def precheck(self) -> None:
        """Check that the authenticated Databricks user is active.

        Raises:
            The result of ``wrap_error`` for any failure; an inactive user
            surfaces as ``AssertionError``, as it did with the original assert.
        """
        try:
            current_user = self.connection_config.get_client().current_user.me()
            # Explicit check rather than `assert`: under `python -O` an assert
            # is stripped together with the API call it wraps.
            if not current_user.active:
                raise AssertionError("databricks user is not active")
        except Exception as e:
            raise self.connection_config.wrap_error(e=e)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload the file at ``path`` to the volume, overwriting any existing file.

        Errors raised by the upload call are passed through ``wrap_error``.
        """
        # Remote volume paths always use "/", so join with posixpath instead of
        # the host OS separator.
        import posixpath

        output_path = posixpath.join(
            self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
        )
        with open(path, "rb") as elements_file:
            try:
                self.connection_config.get_client().files.upload(
                    file_path=output_path,
                    contents=elements_file,
                    overwrite=True,
                )
            except Exception as e:
                raise self.connection_config.wrap_error(e=e)
@@ -0,0 +1,87 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.v2.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+
21
# Connector type label shared by the AWS indexer, downloader and uploader below.
CONNECTOR_TYPE = "databricks_volumes_aws"
22
+
23
+
24
class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    """Credentials accepted by the AWS flavour of the Databricks Volumes connector."""

    account_id: Optional[str] = Field(
        default=None,
        description="The Databricks account ID for the Databricks accounts endpoint",
    )
    profile: Optional[str] = None
    # Re-declares the inherited field with a more specific description.
    token: Optional[str] = Field(
        default=None,
        description="The Databricks personal access token (PAT)",
    )
34
+
35
+
36
class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    """Connection config whose secret access config is the AWS-specific one."""

    access_config: Secret[DatabricksAWSVolumesAccessConfig]
38
+
39
+
40
class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    """AWS indexer configuration; identical to the shared volumes indexer config."""
42
+
43
+
44
@dataclass
class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
    """Shared volumes indexer bound to the AWS config classes and connector type."""

    connection_config: DatabricksAWSVolumesConnectionConfig
    index_config: DatabricksAWSVolumesIndexerConfig
    connector_type: str = CONNECTOR_TYPE
49
+
50
+
51
class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    """AWS downloader configuration; identical to the shared volumes downloader config."""
53
+
54
+
55
@dataclass
class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
    """Shared volumes downloader bound to the AWS config classes and connector type."""

    connection_config: DatabricksAWSVolumesConnectionConfig
    # Annotated with the AWS-specific config so it matches the class registered
    # as `downloader_config` in the source registry entry below.
    download_config: DatabricksAWSVolumesDownloaderConfig
    connector_type: str = CONNECTOR_TYPE
60
+
61
+
62
class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    """AWS uploader configuration; identical to the shared volumes uploader config."""
64
+
65
+
66
@dataclass
class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
    """Shared volumes uploader bound to the AWS config classes and connector type."""

    connection_config: DatabricksAWSVolumesConnectionConfig
    # NOTE(review): the uploader config inherits `volume` and `catalog` from
    # DatabricksPathMixin, where they have no defaults, so this zero-argument
    # factory looks like it cannot succeed if the default is ever used. Confirm.
    upload_config: DatabricksAWSVolumesUploaderConfig = field(
        default_factory=DatabricksAWSVolumesUploaderConfig,
    )
    connector_type: str = CONNECTOR_TYPE
73
+
74
+
75
# Registry entry bundling the classes that write to Databricks Volumes on AWS.
databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
    connection_config=DatabricksAWSVolumesConnectionConfig,
    uploader_config=DatabricksAWSVolumesUploaderConfig,
    uploader=DatabricksAWSVolumesUploader,
)

# Registry entry bundling the classes that read from Databricks Volumes on AWS.
databricks_aws_volumes_source_entry = SourceRegistryEntry(
    connection_config=DatabricksAWSVolumesConnectionConfig,
    indexer_config=DatabricksAWSVolumesIndexerConfig,
    indexer=DatabricksAWSVolumesIndexer,
    downloader_config=DatabricksAWSVolumesDownloaderConfig,
    downloader=DatabricksAWSVolumesDownloader,
)