unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,273 @@
1
+ import json
2
+ from abc import ABC, abstractmethod
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from time import time
7
+ from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.error import (
12
+ DestinationConnectionError,
13
+ SourceConnectionError,
14
+ SourceConnectionNetworkError,
15
+ )
16
+ from unstructured_ingest.utils.data_prep import batch_generator
17
+ from unstructured_ingest.utils.dep_check import requires_dependencies
18
+ from unstructured_ingest.v2.interfaces import (
19
+ AccessConfig,
20
+ ConnectionConfig,
21
+ Downloader,
22
+ DownloaderConfig,
23
+ DownloadResponse,
24
+ FileData,
25
+ FileDataSourceMetadata,
26
+ Indexer,
27
+ IndexerConfig,
28
+ SourceIdentifiers,
29
+ Uploader,
30
+ UploaderConfig,
31
+ )
32
+ from unstructured_ingest.v2.logger import logger
33
+
34
+ if TYPE_CHECKING:
35
+ from confluent_kafka import Consumer, Producer
36
+
37
+
38
class KafkaAccessConfig(AccessConfig, ABC):
    """Abstract base for Kafka access configs; declares no fields of its own."""
40
+
41
+
42
class KafkaConnectionConfig(ConnectionConfig, ABC):
    """Abstract connection settings shared by the Kafka connectors.

    Subclasses supply the dictionaries handed to the ``confluent_kafka``
    ``Consumer`` and ``Producer`` constructors.
    """

    access_config: Secret[KafkaAccessConfig]
    bootstrap_server: str
    port: int
    group_id: str = Field(
        description="A consumer group is a way to allow a pool of consumers "
        "to divide the consumption of data over topics and partitions.",
        default="default_group_id",
    )

    @abstractmethod
    def get_consumer_configuration(self) -> dict:
        """Return the configuration dict passed to ``confluent_kafka.Consumer``."""
        ...

    @abstractmethod
    def get_producer_configuration(self) -> dict:
        """Return the configuration dict passed to ``confluent_kafka.Producer``."""
        ...

    @contextmanager
    @requires_dependencies(["confluent_kafka"], extras="kafka")
    def get_consumer(self) -> ContextManager["Consumer"]:
        """Yield a consumer built from the consumer configuration.

        The consumer is closed when the ``with`` block exits, whether or not
        the block raised.
        """
        from confluent_kafka import Consumer

        consumer_settings = self.get_consumer_configuration()
        kafka_consumer = Consumer(consumer_settings)
        try:
            logger.debug("kafka consumer connected")
            yield kafka_consumer
        finally:
            kafka_consumer.close()

    @requires_dependencies(["confluent_kafka"], extras="kafka")
    def get_producer(self) -> "Producer":
        """Build and return a producer from the producer configuration."""
        from confluent_kafka import Producer

        return Producer(self.get_producer_configuration())
78
+
79
+
80
class KafkaIndexerConfig(IndexerConfig):
    """Settings that control which topic is read and how polling behaves."""

    topic: str = Field(description="which topic to consume from")
    num_messages_to_consume: Optional[int] = 100
    timeout: Optional[float] = Field(default=3.0, description="polling timeout", ge=3.0)

    def update_consumer(self, consumer: "Consumer") -> None:
        """Subscribe ``consumer`` to the configured topic."""
        subscribed_topics = [self.topic]
        consumer.subscribe(subscribed_topics)
87
+
88
+
89
@dataclass
class KafkaIndexer(Indexer, ABC):
    """Index a Kafka topic by turning each consumed message into a ``FileData``.

    The message payload is carried in ``additional_metadata["content"]``.
    """

    connection_config: KafkaConnectionConfig
    index_config: KafkaIndexerConfig

    @contextmanager
    def get_consumer(self) -> ContextManager["Consumer"]:
        """Yield a consumer that is already subscribed to the configured topic."""
        with self.connection_config.get_consumer() as consumer:
            self.index_config.update_consumer(consumer=consumer)
            yield consumer

    @requires_dependencies(["confluent_kafka"], extras="kafka")
    def generate_messages(self) -> Generator[Any, None, None]:
        """Poll the topic and yield messages one at a time.

        Polling stops when ``num_messages_to_consume`` messages were yielded
        (``None`` means no cap), when ten consecutive polls return nothing, or
        when the end of a partition is reported.

        Raises:
            KafkaException: if a polled message carries any error other than
                the end-of-partition marker.
        """
        from confluent_kafka import KafkaError, KafkaException

        messages_consumed = 0
        max_empty_polls = 10
        empty_polls = 0
        # The field is Optional; comparing an int against None would raise
        # TypeError, so None is treated as "no upper bound".
        num_messages_to_consume = self.index_config.num_messages_to_consume
        with self.get_consumer() as consumer:
            while empty_polls < max_empty_polls and (
                num_messages_to_consume is None or messages_consumed < num_messages_to_consume
            ):
                msg = consumer.poll(timeout=self.index_config.timeout)
                if msg is None:
                    logger.debug("No Kafka messages found")
                    empty_polls += 1
                    continue
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        logger.info(
                            "Reached end of partition for topic %s [%d] at offset %d",
                            msg.topic(),
                            msg.partition(),
                            msg.offset(),
                        )
                        break
                    raise KafkaException(msg.error())
                try:
                    empty_polls = 0
                    messages_consumed += 1
                    yield msg
                finally:
                    # Commit synchronously once the caller is done with the
                    # message, including when the generator is closed early.
                    consumer.commit(asynchronous=False)

    def generate_file_data(self, msg) -> FileData:
        """Build the ``FileData`` record for a single consumed message.

        The identifier is ``<topic>_<partition>_<offset>`` and the UTF-8
        decoded payload is stored under ``additional_metadata["content"]``.
        """
        # NOTE(review): msg.value() is decoded unconditionally; a message
        # without a payload would fail here - confirm whether that can occur.
        msg_content = msg.value().decode("utf8")
        identifier = f"{msg.topic()}_{msg.partition()}_{msg.offset()}"
        additional_metadata = {
            "topic": msg.topic(),
            "partition": msg.partition(),
            "offset": msg.offset(),
            "content": msg_content,
        }
        filename = f"{identifier}.txt"
        return FileData(
            identifier=identifier,
            connector_type=self.connector_type,
            source_identifiers=SourceIdentifiers(
                filename=filename,
                fullpath=filename,
            ),
            metadata=FileDataSourceMetadata(
                date_processed=str(time()),
            ),
            additional_metadata=additional_metadata,
            display_name=filename,
        )

    def run(self) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per message produced by ``generate_messages``."""
        for message in self.generate_messages():
            yield self.generate_file_data(message)

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def precheck(self):
        """Verify that the configured topic is listed by the cluster.

        Raises:
            SourceConnectionError: if the topic is missing or the cluster
                metadata cannot be fetched. The underlying exception is
                chained as the cause.
        """
        try:
            with self.get_consumer() as consumer:
                # timeout needs at least 3 secs, more info:
                # https://forum.confluent.io/t/kafkacat-connect-failure-to-confcloud-ssl/2513
                cluster_meta = consumer.list_topics(timeout=5)
                current_topics = [
                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                ]
                if self.index_config.topic not in current_topics:
                    raise SourceConnectionError(
                        "expected topic '{}' not detected in cluster: '{}'".format(
                            self.index_config.topic, ", ".join(current_topics)
                        )
                    )
                logger.info(f"successfully checked available topics: {current_topics}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e
181
+
182
+
183
class KafkaDownloaderConfig(DownloaderConfig):
    """Downloader settings for Kafka; adds no fields to ``DownloaderConfig``."""
185
+
186
+
187
@dataclass
class KafkaDownloader(Downloader, ABC):
    """Write the message content carried in ``FileData`` to a local file."""

    connection_config: KafkaConnectionConfig
    download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
    version: Optional[str] = None
    source_url: Optional[str] = None

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Persist ``additional_metadata["content"]`` under the download dir.

        Raises:
            ValueError: if ``file_data`` has no source identifiers.
            SourceConnectionNetworkError: if the content is missing or the
                file cannot be written. The original exception is chained.
        """
        source_identifiers = file_data.source_identifiers
        if source_identifiers is None:
            raise ValueError("FileData is missing source_identifiers")

        # Build the download path using source_identifiers
        download_path = Path(self.download_dir) / source_identifiers.relative_path
        download_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            content = file_data.additional_metadata["content"]
            # Explicit UTF-8 so the result does not depend on the platform's
            # default text encoding.
            with open(download_path, "w", encoding="utf-8") as file:
                file.write(content)
        except Exception as e:
            logger.error(f"Failed to download file {file_data.identifier}: {e}")
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        return self.generate_download_response(file_data=file_data, download_path=download_path)
212
+
213
+
214
class KafkaUploaderConfig(UploaderConfig):
    """Settings for writing elements to a Kafka topic."""

    batch_size: int = Field(default=100, description="Batch size")
    topic: str = Field(description="which topic to write to")
    timeout: Optional[float] = Field(
        description="Timeout in seconds to flush batch of messages",
        default=10.0,
    )
220
+
221
+
222
@dataclass
class KafkaUploader(Uploader, ABC):
    """Publish element dictionaries to a Kafka topic as JSON messages."""

    connection_config: KafkaConnectionConfig
    upload_config: KafkaUploaderConfig

    def precheck(self):
        """Verify that the destination topic is listed by the cluster.

        Raises:
            DestinationConnectionError: if the topic is missing or the cluster
                metadata cannot be fetched. The underlying exception is
                chained as the cause.
        """
        try:
            with self.connection_config.get_consumer() as consumer:
                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
                current_topics = [
                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                ]
                logger.info(f"successfully checked available topics: {current_topics}")
                if self.upload_config.topic not in current_topics:
                    raise DestinationConnectionError(
                        "expected topic '{}' not detected in cluster: '{}'".format(
                            self.upload_config.topic, ", ".join(current_topics)
                        )
                    )

        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def produce_batch(self, elements: list[dict]) -> None:
        """Produce every element as a JSON message and flush until the queue is empty.

        Raises:
            KafkaException: if any delivery callback reported an error.
        """
        from confluent_kafka.error import KafkaException

        producer = self.connection_config.get_producer()
        failed_producer = False

        def acked(err, msg):
            # Delivery callback: remember that at least one message failed.
            nonlocal failed_producer
            if err is not None:
                failed_producer = True
                logger.error("Failed to deliver kafka message: %s: %s", msg, err)

        for element in elements:
            producer.produce(
                topic=self.upload_config.topic,
                value=json.dumps(element),
                callback=acked,
            )

        # len(producer) is the number of messages still queued; keep flushing
        # until nothing is left.
        while producer_len := len(producer):
            logger.debug(f"another iteration of kafka producer flush. Queue length: {producer_len}")
            producer.flush(timeout=self.upload_config.timeout)
        if failed_producer:
            raise KafkaException("failed to produce all messages in batch")

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Split ``data`` into batches of ``batch_size`` and produce each batch."""
        for element_batch in batch_generator(data, batch_size=self.upload_config.batch_size):
            self.produce_batch(elements=element_batch)
@@ -0,0 +1,103 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field, Secret
5
+
6
+ from unstructured_ingest.v2.processes.connector_registry import (
7
+ DestinationRegistryEntry,
8
+ SourceRegistryEntry,
9
+ )
10
+ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
11
+ KafkaAccessConfig,
12
+ KafkaConnectionConfig,
13
+ KafkaDownloader,
14
+ KafkaDownloaderConfig,
15
+ KafkaIndexer,
16
+ KafkaIndexerConfig,
17
+ KafkaUploader,
18
+ KafkaUploaderConfig,
19
+ )
20
+
21
+ if TYPE_CHECKING:
22
+ pass
23
+
24
+ CONNECTOR_TYPE = "kafka-local"
25
+
26
+
27
class LocalKafkaAccessConfig(KafkaAccessConfig):
    """Access config for the local Kafka connector; declares no fields."""
29
+
30
+
31
class LocalKafkaConnectionConfig(KafkaConnectionConfig):
    """Connection settings for the ``kafka-local`` connector."""

    access_config: Secret[LocalKafkaAccessConfig] = Field(
        default=LocalKafkaAccessConfig(), validate_default=True
    )

    def get_consumer_configuration(self) -> dict:
        """Return consumer settings: manual commits, reading from the earliest offset."""
        return {
            "bootstrap.servers": f"{self.bootstrap_server}:{self.port}",
            "group.id": self.group_id,
            "enable.auto.commit": "false",
            "auto.offset.reset": "earliest",
        }

    def get_producer_configuration(self) -> dict:
        """Return producer settings; only the bootstrap address is set."""
        return {"bootstrap.servers": f"{self.bootstrap_server}:{self.port}"}
56
+
57
+
58
class LocalKafkaIndexerConfig(KafkaIndexerConfig):
    """Indexer settings for ``kafka-local``; adds nothing to ``KafkaIndexerConfig``."""
60
+
61
+
62
@dataclass
class LocalKafkaIndexer(KafkaIndexer):
    """``KafkaIndexer`` bound to the local connection and indexer configs."""

    connection_config: LocalKafkaConnectionConfig
    index_config: LocalKafkaIndexerConfig
    # Connector identifier used for this indexer.
    connector_type: str = CONNECTOR_TYPE
67
+
68
+
69
class LocalKafkaDownloaderConfig(KafkaDownloaderConfig):
    """Downloader settings for ``kafka-local``; adds nothing to the base config."""
71
+
72
+
73
@dataclass
class LocalKafkaDownloader(KafkaDownloader):
    """``KafkaDownloader`` bound to the local connection and downloader configs."""

    connection_config: LocalKafkaConnectionConfig
    download_config: LocalKafkaDownloaderConfig
    # Connector identifier used for this downloader.
    connector_type: str = CONNECTOR_TYPE
78
+
79
+
80
class LocalKafkaUploaderConfig(KafkaUploaderConfig):
    """Uploader settings for ``kafka-local``; adds nothing to the base config."""
82
+
83
+
84
@dataclass
class LocalKafkaUploader(KafkaUploader):
    """``KafkaUploader`` bound to the local connection and uploader configs."""

    connection_config: LocalKafkaConnectionConfig
    upload_config: LocalKafkaUploaderConfig
    # Connector identifier used for this uploader.
    connector_type: str = CONNECTOR_TYPE
89
+
90
+
91
# Source-side registration: connection, indexer and downloader classes.
kafka_local_source_entry = SourceRegistryEntry(
    connection_config=LocalKafkaConnectionConfig,
    indexer_config=LocalKafkaIndexerConfig,
    indexer=LocalKafkaIndexer,
    downloader_config=LocalKafkaDownloaderConfig,
    downloader=LocalKafkaDownloader,
)

# Destination-side registration: connection and uploader classes.
kafka_local_destination_entry = DestinationRegistryEntry(
    connection_config=LocalKafkaConnectionConfig,
    uploader_config=LocalKafkaUploaderConfig,
    uploader=LocalKafkaUploader,
)
@@ -0,0 +1,148 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Generator, Optional
5
+
6
+ import pandas as pd
7
+ from pydantic import Field, Secret
8
+
9
+ from unstructured_ingest.error import DestinationConnectionError
10
+ from unstructured_ingest.utils.data_prep import flatten_dict, get_data_df, split_dataframe
11
+ from unstructured_ingest.utils.dep_check import requires_dependencies
12
+ from unstructured_ingest.v2.interfaces import (
13
+ AccessConfig,
14
+ ConnectionConfig,
15
+ FileData,
16
+ Uploader,
17
+ UploaderConfig,
18
+ UploadStager,
19
+ UploadStagerConfig,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import (
23
+ DestinationRegistryEntry,
24
+ )
25
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
26
+
27
+ if TYPE_CHECKING:
28
+ from kdbai_client import Database, Session, Table
29
+
30
+ CONNECTOR_TYPE = "kdbai"
31
+
32
+
33
class KdbaiAccessConfig(AccessConfig):
    """Secret settings for KDB.AI; currently only the optional API key."""

    api_key: Optional[str] = Field(
        description="A string for the api-key, can be left empty "
        "when connecting to local KDBAI instance.",
        default=None,
    )
39
+
40
+
41
class KdbaiConnectionConfig(ConnectionConfig):
    """Connection settings for a KDB.AI endpoint."""

    access_config: Secret[KdbaiAccessConfig] = Field(
        default=KdbaiAccessConfig(), validate_default=True
    )
    endpoint: str = Field(
        description="Endpoint url where KDBAI is hosted.", default="http://localhost:8082"
    )

    @requires_dependencies(["kdbai_client"], extras="kdbai")
    @contextmanager
    def get_client(self) -> Generator["Session", None, None]:
        """Yield a ``kdbai_client.Session`` and close it when the block exits."""
        from kdbai_client import Session

        # Stays None if the constructor raises, so the cleanup below is skipped.
        kdbai_session = None
        try:
            secrets = self.access_config.get_secret_value()
            kdbai_session = Session(api_key=secrets.api_key, endpoint=self.endpoint)
            yield kdbai_session
        finally:
            if kdbai_session:
                kdbai_session.close()
63
+
64
+
65
class KdbaiUploadStagerConfig(UploadStagerConfig):
    """Stager settings for KDB.AI; adds no fields to ``UploadStagerConfig``."""
67
+
68
+
69
@dataclass
class KdbaiUploadStager(UploadStager):
    """Reshape element dictionaries into the row layout written to KDB.AI."""

    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a row with ``id``, ``element_id``, ``document``, ``embeddings``, ``metadata``.

        Works on a shallow copy, so ``element_dict`` keeps its ``text`` key.
        """
        working = element_dict.copy()
        # The id is computed while "text" is still present in the copy.
        row_id = get_enhanced_element_id(element_dict=working, file_data=file_data)
        element_id = working.get("element_id")
        document = working.pop("text", None)
        embeddings = working.get("embeddings")
        flat_metadata = flatten_dict(
            dictionary=working.get("metadata"),
            flatten_lists=True,
            remove_none=True,
        )
        return {
            "id": row_id,
            "element_id": element_id,
            "document": document,
            "embeddings": embeddings,
            "metadata": flat_metadata,
        }
86
+
87
+
88
class KdbaiUploaderConfig(UploaderConfig):
    """Target database/table and batch size for KDB.AI uploads."""

    database_name: str = Field(
        description="The name of the KDBAI database to write into.", default="default"
    )
    table_name: str = Field(description="The name of the KDBAI table to write into.")
    batch_size: int = Field(description="Number of records per batch", default=100)
94
+
95
+
96
@dataclass
class KdbaiUploader(Uploader):
    """Insert staged element rows into a KDB.AI table in batches."""

    connection_config: KdbaiConnectionConfig
    upload_config: KdbaiUploaderConfig
    connector_type: str = field(default=CONNECTOR_TYPE, init=False)

    def precheck(self) -> None:
        """Open a session and look up the configured database.

        Raises:
            DestinationConnectionError: if the session cannot be created or
                the database lookup fails. The original exception is chained.
        """
        try:
            # get_database() is a context manager: its body only runs once it
            # is entered, so a bare call would validate nothing.
            with self.get_database():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @contextmanager
    def get_database(self) -> Generator["Database", None, None]:
        """Yield the configured database from a freshly opened client session."""
        with self.connection_config.get_client() as client:
            db = client.database(self.upload_config.database_name)
            yield db

    @contextmanager
    def get_table(self) -> Generator["Table", None, None]:
        """Yield the configured table from the configured database."""
        with self.get_database() as db:
            table = db.table(self.upload_config.table_name)
            yield table

    def upsert_batch(self, batch: pd.DataFrame):
        """Insert one batch of rows into the target table."""
        with self.get_table() as table:
            table.insert(batch)

    def process_dataframe(self, df: pd.DataFrame):
        """Split ``df`` into chunks of ``batch_size`` rows and insert each chunk."""
        logger.debug(
            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
        )
        for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
            self.upsert_batch(batch=batch_df)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload in-memory element dictionaries."""
        df = pd.DataFrame(data=data)
        self.process_dataframe(df=df)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the staged file at ``path`` into a DataFrame and upload it."""
        data = get_data_df(path=path)
        self.process_dataframe(df=data)
140
+
141
+
142
# Destination registration for KDB.AI: connection, stager and uploader classes.
kdbai_destination_entry = DestinationRegistryEntry(
    connection_config=KdbaiConnectionConfig,
    upload_stager_config=KdbaiUploadStagerConfig,
    upload_stager=KdbaiUploadStager,
    uploader_config=KdbaiUploaderConfig,
    uploader=KdbaiUploader,
)
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import add_destination_entry
4
+
5
+ from .aws import CONNECTOR_TYPE as LANCEDB_S3_CONNECTOR_TYPE
6
+ from .aws import lancedb_aws_destination_entry
7
+ from .azure import CONNECTOR_TYPE as LANCEDB_AZURE_CONNECTOR_TYPE
8
+ from .azure import lancedb_azure_destination_entry
9
+ from .cloud import CONNECTOR_TYPE as LANCEDB_CLOUD_CONNECTOR_TYPE
10
+ from .cloud import lancedb_cloud_destination_entry
11
+ from .gcp import CONNECTOR_TYPE as LANCEDB_GCS_CONNECTOR_TYPE
12
+ from .gcp import lancedb_gcp_destination_entry
13
+ from .local import CONNECTOR_TYPE as LANCEDB_LOCAL_CONNECTOR_TYPE
14
+ from .local import lancedb_local_destination_entry
15
+
16
# Register every LanceDB destination under its connector type, in the same
# order as before: S3, Azure, GCS, local, cloud.
for _destination_type, _entry in (
    (LANCEDB_S3_CONNECTOR_TYPE, lancedb_aws_destination_entry),
    (LANCEDB_AZURE_CONNECTOR_TYPE, lancedb_azure_destination_entry),
    (LANCEDB_GCS_CONNECTOR_TYPE, lancedb_gcp_destination_entry),
    (LANCEDB_LOCAL_CONNECTOR_TYPE, lancedb_local_destination_entry),
    (LANCEDB_CLOUD_CONNECTOR_TYPE, lancedb_cloud_destination_entry),
):
    add_destination_entry(destination_type=_destination_type, entry=_entry)
# Keep the loop variables out of the package namespace.
del _destination_type, _entry
@@ -0,0 +1,43 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_aws"
16
+
17
+
18
class LanceDBAwsAccessConfig(AccessConfig):
    """AWS credentials used by the ``lancedb_aws`` connector."""

    aws_access_key_id: str = Field(description="The AWS access key ID to use.")
    aws_secret_access_key: str = Field(description="The AWS secret access key to use.")
21
+
22
+
23
class LanceDBAwsConnectionConfig(LanceDBRemoteConnectionConfig):
    """Connection settings for LanceDB using AWS credentials."""

    access_config: Secret[LanceDBAwsAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access-config fields plus a ``timeout`` entry."""
        # self.timeout is not declared here; presumably inherited from the base class.
        storage_options = dict(self.access_config.get_secret_value().model_dump())
        storage_options["timeout"] = self.timeout
        return storage_options
28
+
29
+
30
@dataclass
class LanceDBAwsUploader(LanceDBUploader):
    """``LanceDBUploader`` bound to the AWS connection config."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBAwsConnectionConfig
    # Connector identifier used for this uploader.
    connector_type: str = CONNECTOR_TYPE
35
+
36
+
37
# Destination registration for LanceDB with AWS credentials.
lancedb_aws_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBAwsConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBAwsUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,43 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_azure"
16
+
17
+
18
class LanceDBAzureAccessConfig(AccessConfig):
    """Azure storage credentials used by the ``lancedb_azure`` connector."""

    azure_storage_account_name: str = Field(description="The name of the azure storage account.")
    azure_storage_account_key: str = Field(description="The serialized azure service account key.")
21
+
22
+
23
class LanceDBAzureConnectionConfig(LanceDBRemoteConnectionConfig):
    """Connection settings for LanceDB using Azure storage credentials."""

    access_config: Secret[LanceDBAzureAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access-config fields plus a ``timeout`` entry."""
        # self.timeout is not declared here; presumably inherited from the base class.
        storage_options = dict(self.access_config.get_secret_value().model_dump())
        storage_options["timeout"] = self.timeout
        return storage_options
28
+
29
+
30
@dataclass
class LanceDBAzureUploader(LanceDBUploader):
    """``LanceDBUploader`` bound to the Azure connection config."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBAzureConnectionConfig
    # Connector identifier used for this uploader.
    connector_type: str = CONNECTOR_TYPE
35
+
36
+
37
# Destination registration for LanceDB with Azure storage credentials.
lancedb_azure_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBAzureConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBAzureUploader,
    uploader_config=LanceDBUploaderConfig,
)
@@ -0,0 +1,42 @@
1
+ from dataclasses import dataclass
2
+
3
+ from pydantic import Field, Secret
4
+
5
+ from unstructured_ingest.v2.interfaces.connector import AccessConfig
6
+ from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
7
+ from unstructured_ingest.v2.processes.connectors.lancedb.lancedb import (
8
+ LanceDBRemoteConnectionConfig,
9
+ LanceDBUploader,
10
+ LanceDBUploaderConfig,
11
+ LanceDBUploadStager,
12
+ LanceDBUploadStagerConfig,
13
+ )
14
+
15
+ CONNECTOR_TYPE = "lancedb_cloud"
16
+
17
+
18
class LanceDBCloudAccessConfig(AccessConfig):
    """API key used by the ``lancedb_cloud`` connector."""

    api_key: str = Field(description="Api key associated with LanceDb cloud")
20
+
21
+
22
class LanceDBCloudConnectionConfig(LanceDBRemoteConnectionConfig):
    """Connection settings for LanceDB Cloud."""

    access_config: Secret[LanceDBCloudAccessConfig]

    def get_storage_options(self) -> dict:
        """Return the dumped access-config fields plus a ``timeout`` entry."""
        # self.timeout is not declared here; presumably inherited from the base class.
        storage_options = dict(self.access_config.get_secret_value().model_dump())
        storage_options["timeout"] = self.timeout
        return storage_options
27
+
28
+
29
@dataclass
class LanceDBCloudUploader(LanceDBUploader):
    """``LanceDBUploader`` bound to the LanceDB Cloud connection config."""

    upload_config: LanceDBUploaderConfig
    connection_config: LanceDBCloudConnectionConfig
    # Connector identifier used for this uploader.
    connector_type: str = CONNECTOR_TYPE
34
+
35
+
36
# Destination registration for LanceDB Cloud.
lancedb_cloud_destination_entry = DestinationRegistryEntry(
    connection_config=LanceDBCloudConnectionConfig,
    upload_stager=LanceDBUploadStager,
    upload_stager_config=LanceDBUploadStagerConfig,
    uploader=LanceDBCloudUploader,
    uploader_config=LanceDBUploaderConfig,
)