unstructured-ingest 0.3.13__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,102 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.v2.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+
21
# Connector identifier for this module; used below as the default `connector_type`
# of the Azure indexer, downloader and uploader dataclasses.
+ CONNECTOR_TYPE = "databricks_volumes_azure"
22
+
23
+
24
# Access settings for the Azure flavour of the Databricks Volumes connector.
# Every field is optional and defaults to None.
# (Documented with comments rather than a class docstring so the model's
# generated schema description is left untouched.)
class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
    azure_client_secret: Optional[str] = Field(
        description="The Azure AD service principal’s client secret.",
        default=None,
    )
    azure_client_id: Optional[str] = Field(
        description="The Azure AD service principal’s application ID.",
        default=None,
    )
    azure_tenant_id: Optional[str] = Field(
        description="The Azure AD service principal’s tenant ID.",
        default=None,
    )
    azure_environment: Optional[str] = Field(
        description="The Azure environment type for a specific set of API endpoints",
        examples=["Public", "UsGov", "China", "Germany"],
        default=None,
    )
49
+
50
+
51
# Connection settings for Azure: narrows `access_config` to the Azure access model.
class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Required (no default); the access model is wrapped in pydantic's `Secret`.
    access_config: Secret[DatabricksAzureVolumesAccessConfig]
53
+
54
+
55
# Azure indexer settings; adds nothing to the shared Databricks Volumes indexer config.
class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
57
+
58
+
59
@dataclass
class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the Azure connection and indexer configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    index_config: DatabricksAzureVolumesIndexerConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
64
+
65
+
66
# Azure downloader settings; adds nothing to the shared Databricks Volumes downloader config.
class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
68
+
69
+
70
@dataclass
class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the Azure connection and downloader configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    # Annotated with the Azure-specific config (a subclass of the shared one) so it
    # matches the `downloader_config` this module registers for the source entry
    # and mirrors how the indexer and uploader are typed.
    download_config: DatabricksAzureVolumesDownloaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
75
+
76
+
77
# Azure uploader settings; adds nothing to the shared Databricks Volumes uploader config.
class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
79
+
80
+
81
@dataclass
class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the Azure connection and uploader configs."""

    connection_config: DatabricksAzureVolumesConnectionConfig
    # A fresh default uploader config is built per instance when none is supplied.
    upload_config: DatabricksAzureVolumesUploaderConfig = field(
        default_factory=DatabricksAzureVolumesUploaderConfig,
    )
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
88
+
89
+
90
# Destination side: the Azure uploader together with its connection and uploader configs.
databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksAzureVolumesUploader,
    uploader_config=DatabricksAzureVolumesUploaderConfig,
    connection_config=DatabricksAzureVolumesConnectionConfig,
)

# Source side: the Azure indexer and downloader together with their configs.
databricks_azure_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksAzureVolumesIndexer,
    indexer_config=DatabricksAzureVolumesIndexerConfig,
    downloader=DatabricksAzureVolumesDownloader,
    downloader_config=DatabricksAzureVolumesDownloaderConfig,
    connection_config=DatabricksAzureVolumesConnectionConfig,
)
@@ -0,0 +1,85 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.v2.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+
21
# Connector identifier for this module; used below as the default `connector_type`
# of the Google indexer, downloader and uploader dataclasses.
+ CONNECTOR_TYPE = "databricks_volumes_gcp"
22
+
23
+
24
# Access settings for the Google flavour of the Databricks Volumes connector.
# Every field is optional and defaults to None.
# (Documented with comments rather than a class docstring so the model's
# generated schema description is left untouched.)
class DatabricksGoogleVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(
        description="The Databricks account ID for the Databricks accounts endpoint.",
        default=None,
    )
    profile: Optional[str] = None
    google_credentials: Optional[str] = None
    google_service_account: Optional[str] = None
32
+
33
+
34
# Connection settings for Google: narrows `access_config` to the Google access model.
class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Required (no default); the access model is wrapped in pydantic's `Secret`.
    access_config: Secret[DatabricksGoogleVolumesAccessConfig]
36
+
37
+
38
# Google indexer settings; adds nothing to the shared Databricks Volumes indexer config.
class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
40
+
41
+
42
@dataclass
class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the Google connection and indexer configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    index_config: DatabricksGoogleVolumesIndexerConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
47
+
48
+
49
# Google downloader settings; adds nothing to the shared Databricks Volumes downloader config.
class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
51
+
52
+
53
@dataclass
class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the Google connection and downloader configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    # Annotated with the Google-specific config (a subclass of the shared one) so it
    # matches the `downloader_config` this module registers for the source entry
    # and mirrors how the indexer and uploader are typed.
    download_config: DatabricksGoogleVolumesDownloaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
58
+
59
+
60
# Google uploader settings; adds nothing to the shared Databricks Volumes uploader config.
class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
62
+
63
+
64
@dataclass
class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the Google connection and uploader configs."""

    connection_config: DatabricksGoogleVolumesConnectionConfig
    # A fresh default uploader config is built per instance when none is supplied.
    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
        default_factory=DatabricksGoogleVolumesUploaderConfig,
    )
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
71
+
72
+
73
# Destination side: the Google uploader together with its connection and uploader configs.
databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksGoogleVolumesUploader,
    uploader_config=DatabricksGoogleVolumesUploaderConfig,
    connection_config=DatabricksGoogleVolumesConnectionConfig,
)

# Source side: the Google indexer and downloader together with their configs.
databricks_gcp_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksGoogleVolumesIndexer,
    indexer_config=DatabricksGoogleVolumesIndexerConfig,
    downloader=DatabricksGoogleVolumesDownloader,
    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
    connection_config=DatabricksGoogleVolumesConnectionConfig,
)
@@ -0,0 +1,86 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.v2.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
11
+ DatabricksVolumesAccessConfig,
12
+ DatabricksVolumesConnectionConfig,
13
+ DatabricksVolumesDownloader,
14
+ DatabricksVolumesDownloaderConfig,
15
+ DatabricksVolumesIndexer,
16
+ DatabricksVolumesIndexerConfig,
17
+ DatabricksVolumesUploader,
18
+ DatabricksVolumesUploaderConfig,
19
+ )
20
+
21
# Connector identifier for this module; used below as the default `connector_type`
# of the native indexer, downloader and uploader dataclasses.
+ CONNECTOR_TYPE = "databricks_volumes"
22
+
23
+
24
# Access settings for the native (OAuth app) flavour of the Databricks Volumes connector.
# Every field is optional and defaults to None.
# (Documented with comments rather than a class docstring so the model's
# generated schema description is left untouched.)
class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
    client_id: Optional[str] = Field(
        description="Client ID of the OAuth app.",
        default=None,
    )
    client_secret: Optional[str] = Field(
        description="Client Secret of the OAuth app.",
        default=None,
    )
    profile: Optional[str] = None
    azure_workspace_resource_id: Optional[str] = Field(
        description=(
            "The Azure Resource Manager ID for the Azure Databricks workspace, "
            "which is exchanged for a Databricks host URL."
        ),
        default=None,
    )
35
+
36
+
37
# Connection settings for native auth: narrows `access_config` to the native access model.
class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
    # Required (no default); the access model is wrapped in pydantic's `Secret`.
    access_config: Secret[DatabricksNativeVolumesAccessConfig]
39
+
40
+
41
# Native indexer settings; adds nothing to the shared Databricks Volumes indexer config.
class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
    pass
43
+
44
+
45
@dataclass
class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
    """Databricks Volumes indexer typed against the native connection and indexer configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    index_config: DatabricksNativeVolumesIndexerConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
50
+
51
+
52
# Native downloader settings; adds nothing to the shared Databricks Volumes downloader config.
class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
    pass
54
+
55
+
56
@dataclass
class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
    """Databricks Volumes downloader typed against the native connection and downloader configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    # Annotated with the native-specific config (a subclass of the shared one) so it
    # matches the `downloader_config` this module registers for the source entry
    # and mirrors how the indexer and uploader are typed.
    download_config: DatabricksNativeVolumesDownloaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
61
+
62
+
63
# Native uploader settings; adds nothing to the shared Databricks Volumes uploader config.
class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
    pass
65
+
66
+
67
@dataclass
class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
    """Databricks Volumes uploader typed against the native connection and uploader configs."""

    connection_config: DatabricksNativeVolumesConnectionConfig
    # No default is declared here: the uploader config must be supplied by the caller.
    upload_config: DatabricksNativeVolumesUploaderConfig
    # Defaults to this module's connector identifier.
    connector_type: str = CONNECTOR_TYPE
72
+
73
+
74
# Destination side: the native uploader together with its connection and uploader configs.
databricks_native_volumes_destination_entry = DestinationRegistryEntry(
    uploader=DatabricksNativeVolumesUploader,
    uploader_config=DatabricksNativeVolumesUploaderConfig,
    connection_config=DatabricksNativeVolumesConnectionConfig,
)

# Source side: the native indexer and downloader together with their configs.
databricks_native_volumes_source_entry = SourceRegistryEntry(
    indexer=DatabricksNativeVolumesIndexer,
    indexer_config=DatabricksNativeVolumesIndexerConfig,
    downloader=DatabricksNativeVolumesDownloader,
    downloader_config=DatabricksNativeVolumesDownloaderConfig,
    connection_config=DatabricksNativeVolumesConnectionConfig,
)
@@ -0,0 +1,191 @@
1
+ import json
2
+ import os
3
+ import traceback
4
+ from dataclasses import dataclass, field
5
+ from multiprocessing import Process, Queue
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+ from urllib.parse import urlparse
9
+
10
+ import pandas as pd
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.error import DestinationConnectionError
14
+ from unstructured_ingest.utils.data_prep import get_data_df
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+ from unstructured_ingest.utils.table import convert_to_pandas_dataframe
17
+ from unstructured_ingest.v2.interfaces import (
18
+ AccessConfig,
19
+ ConnectionConfig,
20
+ FileData,
21
+ Uploader,
22
+ UploaderConfig,
23
+ UploadStager,
24
+ UploadStagerConfig,
25
+ )
26
+ from unstructured_ingest.v2.logger import logger
27
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
28
+
29
# Connector identifier for this module; used below as the default `connector_type`
# of `DeltaTableUploader`.
+ CONNECTOR_TYPE = "delta_table"
30
+
31
+
32
@requires_dependencies(["deltalake"], extras="delta-table")
def write_deltalake_with_error_handling(queue, **kwargs):
    """Call ``write_deltalake(**kwargs)`` and report any failure through ``queue``.

    Args:
        queue: Object with a ``put`` method; receives the formatted traceback
            string if the write raises.
        **kwargs: Forwarded unchanged to ``deltalake.writer.write_deltalake``.

    The exception itself is not re-raised; the traceback on ``queue`` is the
    only failure signal this function produces.
    """
    from deltalake.writer import write_deltalake

    try:
        write_deltalake(**kwargs)
    except Exception:
        failure_report = traceback.format_exc()
        queue.put(failure_report)
40
+
41
+
42
# Optional AWS key pair for the Delta table destination; both fields default to None.
class DeltaTableAccessConfig(AccessConfig):
    aws_access_key_id: Optional[str] = Field(
        description="AWS Access Key Id",
        default=None,
    )
    aws_secret_access_key: Optional[str] = Field(
        description="AWS Secret Access Key",
        default=None,
    )
45
+
46
+
47
# Connection settings for the Delta table destination: where the table lives
# and, optionally, the AWS region and credentials used to reach it.
class DeltaTableConnectionConfig(ConnectionConfig):
    access_config: Secret[DeltaTableAccessConfig] = Field(
        default=DeltaTableAccessConfig(), validate_default=True
    )
    aws_region: Optional[str] = Field(description="AWS Region", default=None)
    # NOTE(review): annotated `str` but defaults to None — confirm whether this
    # field is meant to be required.
    table_uri: str = Field(
        description=(
            "Local path or path to the target folder in the S3 bucket, "
            "formatted as s3://my-bucket/my-folder/"
        ),
        default=None,
    )

    def update_storage_options(self, storage_options: dict) -> None:
        """Add AWS settings to ``storage_options`` in place.

        Nothing is written unless the region, the access key id and the secret
        access key are all set.
        """
        credentials = self.access_config.get_secret_value()
        fully_configured = (
            self.aws_region
            and credentials.aws_access_key_id
            and credentials.aws_secret_access_key
        )
        if not fully_configured:
            return

        storage_options.update(
            {
                "AWS_REGION": self.aws_region,
                "AWS_ACCESS_KEY_ID": credentials.aws_access_key_id,
                "AWS_SECRET_ACCESS_KEY": credentials.aws_secret_access_key,
                # Delta-rs doesn't support concurrent S3 writes without external locks
                # (DynamoDB). This flag allows single-writer uploads to S3 without using
                # locks, according to:
                # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/
                "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
            }
        )
70
+
71
+
72
# Stager settings for the Delta table destination; adds nothing to the base stager config.
class DeltaTableUploadStagerConfig(UploadStagerConfig):
    pass
74
+
75
+
76
@dataclass
class DeltaTableUploadStager(UploadStager):
    """Stage a JSON elements file as a Parquet file for the Delta table uploader."""

    upload_stager_config: DeltaTableUploadStagerConfig = field(
        default_factory=lambda: DeltaTableUploadStagerConfig()
    )

    def run(
        self,
        elements_filepath: Path,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Convert the elements JSON to Parquet and return the new file's path.

        Args:
            elements_filepath: JSON file holding the elements to stage.
            output_dir: Directory in which the Parquet file is written.
            output_filename: Base name of the output; ``.parquet`` is appended.
            **kwargs: Accepted and ignored.

        Returns:
            Path of the written ``<output_filename>.parquet`` file.
        """
        parquet_path = Path(output_dir) / Path(f"{output_filename}.parquet")

        with open(elements_filepath) as source:
            elements = json.load(source)

        frame = convert_to_pandas_dataframe(elements_dict=elements)
        frame.to_parquet(parquet_path)
        return parquet_path
98
+
99
+
100
# Uploader settings for the Delta table destination; adds nothing to the base uploader config.
class DeltaTableUploaderConfig(UploaderConfig):
    pass
102
+
103
+
104
@dataclass
class DeltaTableUploader(Uploader):
    """Write element data to a Delta table located under ``connection_config.table_uri``."""

    upload_config: DeltaTableUploaderConfig
    connection_config: DeltaTableConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["boto3"], extras="delta-table")
    def precheck(self):
        """Validate S3 access and the configured region.

        The check only runs when the region and both AWS keys are set;
        otherwise this method does nothing.

        Raises:
            DestinationConnectionError: if the probe write, the bucket location
                lookup, or the region comparison fails.
        """
        secrets = self.connection_config.access_config.get_secret_value()
        if (
            self.connection_config.aws_region
            and secrets.aws_access_key_id
            and secrets.aws_secret_access_key
        ):
            from boto3 import client

            url = urlparse(self.connection_config.table_uri)
            bucket_name = url.netloc
            dir_path = url.path.lstrip("/")

            try:
                s3_client = client(
                    "s3",
                    aws_access_key_id=secrets.aws_access_key_id,
                    aws_secret_access_key=secrets.aws_secret_access_key,
                )
                # Probe: writes an empty object at the target key.
                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")

                response = s3_client.get_bucket_location(Bucket=bucket_name)
                # S3 reports a null LocationConstraint for buckets in us-east-1, so
                # comparing the raw value would reject a correctly configured
                # "us-east-1" region.
                bucket_region = response.get("LocationConstraint") or "us-east-1"

                if self.connection_config.aws_region != bucket_region:
                    raise ValueError("Wrong AWS Region was provided.")

            except Exception as e:
                logger.error(f"failed to validate connection: {e}", exc_info=True)
                # Chain the cause so the original traceback stays attached.
                raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def upload_dataframe(self, df: pd.DataFrame, file_data: FileData) -> None:
        """Write ``df`` to a Delta table in a child process, overwriting existing data.

        The destination is ``table_uri`` joined with the file's relative path.

        Raises:
            RuntimeError: if the child process reported a traceback on the queue.
        """
        updated_upload_path = os.path.join(
            self.connection_config.table_uri, file_data.source_identifiers.relative_path
        )
        logger.info(
            f"writing {len(df)} rows to destination table "
            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
        )
        storage_options = {}
        self.connection_config.update_storage_options(storage_options=storage_options)

        writer_kwargs = {
            "table_or_uri": updated_upload_path,
            "data": df,
            "mode": "overwrite",
            "storage_options": storage_options,
        }
        queue = Queue()
        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
        # ingest to fail, even though all tasks are completed normally. Putting the writer into a
        # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
        # rust backend to finish
        writer = Process(
            target=write_deltalake_with_error_handling,
            kwargs={"queue": queue, **writer_kwargs},
        )
        writer.start()
        writer.join()

        # Check if the queue has any exception message
        if not queue.empty():
            error_message = queue.get()
            logger.error(f"Exception occurred in write_deltalake: {error_message}")
            raise RuntimeError(f"Error in write_deltalake: {error_message}")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Build a DataFrame from in-memory records and upload it."""
        df = pd.DataFrame(data=data)
        self.upload_dataframe(df=df, file_data=file_data)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged file at ``path`` via ``get_data_df`` and upload it."""
        df = get_data_df(path)
        self.upload_dataframe(df=df, file_data=file_data)
183
+
184
+
185
# Destination registration: the stager and uploader together with their configs.
delta_table_destination_entry = DestinationRegistryEntry(
    upload_stager=DeltaTableUploadStager,
    upload_stager_config=DeltaTableUploadStagerConfig,
    uploader=DeltaTableUploader,
    uploader_config=DeltaTableUploaderConfig,
    connection_config=DeltaTableConnectionConfig,
)
@@ -0,0 +1,158 @@
1
+ import datetime as dt
2
+ from dataclasses import dataclass
3
+ from typing import TYPE_CHECKING, Any, Generator, Optional
4
+
5
+ from pydantic import Field, Secret
6
+
7
+ from unstructured_ingest.error import SourceConnectionError
8
+ from unstructured_ingest.utils.dep_check import requires_dependencies
9
+ from unstructured_ingest.v2.interfaces import (
10
+ AccessConfig,
11
+ ConnectionConfig,
12
+ Downloader,
13
+ DownloaderConfig,
14
+ DownloadResponse,
15
+ FileData,
16
+ FileDataSourceMetadata,
17
+ Indexer,
18
+ IndexerConfig,
19
+ SourceIdentifiers,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
23
+
24
+ if TYPE_CHECKING:
25
+ from discord import Client as DiscordClient
26
+
27
# Connector identifier for this module; stamped on emitted `FileData` records and
# used as the default `connector_type` of `DiscordDownloader`.
+ CONNECTOR_TYPE = "discord"
28
+
29
+
30
# Credentials for the Discord connector; `token` is required (no default).
class DiscordAccessConfig(AccessConfig):
    token: str = Field(
        description="Discord API token",
    )
32
+
33
+
34
# Connection settings for the Discord connector.
class DiscordConnectionConfig(ConnectionConfig):
    # Required. The previous default was the `DiscordAccessConfig` class object
    # itself, not an instance — and no instance can be built without a token —
    # so omitting the field could never yield a usable config. Declaring it
    # required states that contract directly.
    access_config: Secret[DiscordAccessConfig]

    @requires_dependencies(["discord"], extras="discord")
    def get_client(self) -> "DiscordClient":
        """Return a new, not-yet-connected ``discord.Client``.

        The client uses the default intents with ``message_content`` switched on.
        """
        import discord

        intents = discord.Intents.default()
        intents.message_content = True
        return discord.Client(intents=intents)
46
+
47
+
48
# Indexer settings for the Discord connector.
class DiscordIndexerConfig(IndexerConfig):
    # The default is None, so the annotation is Optional to match.
    channels: Optional[list[str]] = Field(
        default=None,
        description="List of Discord channel IDs to process",
    )
53
+
54
+
55
@dataclass
class DiscordIndexer(Indexer):
    """Emit one ``FileData`` record per configured Discord channel id."""

    connection_config: DiscordConnectionConfig
    index_config: DiscordIndexerConfig

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every distinct configured channel id.

        Duplicates are dropped while keeping first-seen order, so the emission
        order is the same on every run.
        """
        # The returned client is not used here; the call is kept so client
        # construction (and any failure it raises) still happens at index time.
        self.connection_config.get_client()
        unique_channel_ids = dict.fromkeys(self.index_config.channels or [])

        for channel_id in unique_channel_ids:
            file_data = self.get_channel_file_data(channel_id=channel_id)
            if file_data:
                yield file_data

    def precheck(self) -> None:
        """Verify that a token and at least one channel are configured.

        Raises:
            SourceConnectionError: if the token is empty or no channels are set.
        """
        if not self.connection_config.access_config.get_secret_value().token:
            raise SourceConnectionError("Discord token is missing")
        if not self.index_config.channels:
            raise SourceConnectionError("No channels provided")

    def get_channel_file_data(self, channel_id: str) -> Optional[FileData]:
        """Build the ``FileData`` describing one channel; no network call is made.

        The channel id is the record identifier and locator; ``<channel_id>.txt``
        is used as both filename and full path.
        """
        identifier = channel_id
        channel_id = f"{channel_id}.txt"
        source_identifiers = SourceIdentifiers(
            filename=channel_id,
            fullpath=channel_id,
        )
        metadata = FileDataSourceMetadata(
            record_locator={"channel_id": identifier},
            # Naive UTC timestamp; format kept as-is for existing consumers.
            date_processed=str(dt.datetime.utcnow().isoformat()),
        )
        return FileData(
            identifier=identifier,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=source_identifiers,
            metadata=metadata,
        )
93
+
94
+
95
# Downloader settings for the Discord connector.
class DiscordDownloaderConfig(DownloaderConfig):
    # Passed as `limit` to the channel history call; defaults to 100.
    limit: Optional[int] = Field(
        description="Limit on how many messages per channel to pull in",
        default=100,
    )
99
+
100
+
101
@dataclass
class DiscordDownloader(Downloader):
    """Download a Discord channel's message history into a text file."""

    connection_config: DiscordConnectionConfig
    download_config: DiscordDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that only the asynchronous entry point is implemented."""
        return True

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        # Synchronous run is not implemented
        raise NotImplementedError()

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Fetch up to ``download_config.limit`` messages and write them to disk.

        Message contents are joined with newlines and written to the download
        path derived from ``file_data``.

        Raises:
            ValueError: if the record locator has no ``channel_id`` or the
                channel cannot be resolved by the client.
        """
        record_locator = file_data.metadata.record_locator

        if "channel_id" not in record_locator:
            raise ValueError(f"No channel id in file data record locator: {record_locator}")

        client = self.connection_config.get_client()
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)

        messages = []
        failures: list[Exception] = []
        channel_id = record_locator["channel_id"]

        @client.event
        async def on_ready():
            # discord.py logs and discards exceptions raised by event handlers.
            # Without the try/finally, a failure here would skip `client.close()`
            # and leave `client.start()` below waiting forever. Capture the error
            # and always close so it can be re-raised to the caller.
            try:
                logger.debug("Discord Bot is ready")
                channel = client.get_channel(int(channel_id))
                if not channel:
                    raise ValueError(f"channel not found for id: {channel_id}")
                logger.debug(f"Processing messages for channel: {channel.name}")
                async for msg in channel.history(limit=self.download_config.limit):
                    messages.append(msg)
                logger.debug(f"Fetched {len(messages)} messages")
            except Exception as e:
                failures.append(e)
            finally:
                await client.close()

        try:
            await client.start(self.connection_config.access_config.get_secret_value().token)
        finally:
            await client.close()

        if failures:
            raise failures[0]

        content = "\n".join([message.content for message in messages])

        # Explicit UTF-8: message text may contain characters that the platform
        # default encoding cannot represent.
        with open(download_path, "w", encoding="utf-8") as file:
            file.write(content)

        return self.generate_download_response(file_data=file_data, download_path=download_path)
150
+
151
+
152
# Source registration: the Discord indexer and downloader together with their configs.
discord_source_entry = SourceRegistryEntry(
    connection_config=DiscordConnectionConfig,
    indexer=DiscordIndexer,
    indexer_config=DiscordIndexerConfig,
    downloader=DiscordDownloader,
    downloader_config=DiscordDownloaderConfig,
)
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ )
6
+
7
+ from .duckdb import CONNECTOR_TYPE as DUCKDB_CONNECTOR_TYPE
8
+ from .duckdb import duckdb_destination_entry
9
+ from .motherduck import CONNECTOR_TYPE as MOTHERDUCK_CONNECTOR_TYPE
10
+ from .motherduck import motherduck_destination_entry
11
+
12
# Register both destinations of this package under their connector identifiers
# (DuckDB first, then MotherDuck).
add_destination_entry(
    entry=duckdb_destination_entry,
    destination_type=DUCKDB_CONNECTOR_TYPE,
)
add_destination_entry(
    entry=motherduck_destination_entry,
    destination_type=MOTHERDUCK_CONNECTOR_TYPE,
)