unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,402 @@
1
+ import csv
2
+ import hashlib
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from pydantic import BaseModel, Field, Secret
9
+
10
+ from unstructured_ingest import __name__ as integration_name
11
+ from unstructured_ingest.__version__ import __version__ as integration_version
12
+ from unstructured_ingest.error import (
13
+ DestinationConnectionError,
14
+ SourceConnectionError,
15
+ SourceConnectionNetworkError,
16
+ )
17
+ from unstructured_ingest.utils.data_prep import batch_generator, get_data
18
+ from unstructured_ingest.utils.dep_check import requires_dependencies
19
+ from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
20
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
21
+ from unstructured_ingest.v2.interfaces import (
22
+ AccessConfig,
23
+ BatchFileData,
24
+ BatchItem,
25
+ ConnectionConfig,
26
+ Downloader,
27
+ DownloaderConfig,
28
+ DownloadResponse,
29
+ FileData,
30
+ FileDataSourceMetadata,
31
+ Indexer,
32
+ IndexerConfig,
33
+ SourceIdentifiers,
34
+ Uploader,
35
+ UploaderConfig,
36
+ UploadStager,
37
+ UploadStagerConfig,
38
+ download_responses,
39
+ )
40
+ from unstructured_ingest.v2.logger import logger
41
+ from unstructured_ingest.v2.processes.connector_registry import (
42
+ DestinationRegistryEntry,
43
+ SourceRegistryEntry,
44
+ )
45
+
46
+ if TYPE_CHECKING:
47
+ from astrapy import AsyncCollection as AstraDBAsyncCollection
48
+ from astrapy import Collection as AstraDBCollection
49
+ from astrapy import DataAPIClient as AstraDBClient
50
+
51
+
52
# Identifier used by the classes below to tag themselves as the Astra DB connector.
CONNECTOR_TYPE = "astradb"

# Byte ceiling handed to truncate_string_bytes for "text" and "text_as_html" values.
MAX_CONTENT_PARAM_BYTE_SIZE = 8000
55
+
56
+
57
class AstraDBAdditionalMetadata(BaseModel):
    # Location of the records referenced by a batch: the collection they live in
    # and, optionally, the keyspace that collection belongs to.
    collection_name: str
    keyspace: Optional[str] = None
60
+
61
+
62
class AstraDBBatchFileData(BatchFileData):
    # Narrows the generic batch payload so that additional_metadata always carries
    # the collection name / keyspace needed to fetch the batch's records.
    additional_metadata: AstraDBAdditionalMetadata
64
+
65
+
66
class AstraDBAccessConfig(AccessConfig):
    # Credentials and endpoint passed to the client's get_database /
    # get_async_database calls.
    token: str = Field(
        description="Astra DB Token with access to the database.",
    )
    api_endpoint: str = Field(
        description="The API endpoint for the Astra DB.",
    )
69
+
70
+
71
class AstraDBConnectionConfig(ConnectionConfig):
    # Connection settings for Astra DB; the access config is wrapped in a
    # pydantic Secret and must be unwrapped with get_secret_value().
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
    access_config: Secret[AstraDBAccessConfig]

    @requires_dependencies(["astrapy"], extras="astradb")
    def get_client(self) -> "AstraDBClient":
        """Build a Data API client tagged with this package's name and version."""
        # Imported inside the method so the module can be imported without astrapy.
        from astrapy import DataAPIClient

        # caller_name / caller_version identify this integration to Astra DB.
        caller_info = {
            "caller_name": integration_name,
            "caller_version": integration_version,
        }
        return DataAPIClient(**caller_info)
85
+
86
+
87
def get_astra_collection(
    connection_config: AstraDBConnectionConfig,
    collection_name: str,
    keyspace: Optional[str] = None,
) -> "AstraDBCollection":
    """Return a synchronous handle to an Astra DB collection.

    Args:
        connection_config: Supplies the client factory plus the API endpoint and
            token (read from its secret access config).
        collection_name: Name of the collection to open.
        keyspace: Keyspace passed to ``get_database``. The indexer and uploader
            configs declare this as optional, so ``None`` is forwarded as-is.

    Returns:
        The object returned by ``get_collection`` on the database handle.
    """
    access_configs = connection_config.access_config.get_secret_value()

    # The client carries caller_name/version for Astra DB tracking.
    client = connection_config.get_client()

    astra_db = client.get_database(
        api_endpoint=access_configs.api_endpoint,
        token=access_configs.token,
        keyspace=keyspace,
    )
    return astra_db.get_collection(name=collection_name)
109
+
110
+
111
async def get_async_astra_collection(
    connection_config: AstraDBConnectionConfig,
    collection_name: str,
    keyspace: Optional[str] = None,
) -> "AstraDBAsyncCollection":
    """Return an asynchronous handle to an Astra DB collection.

    Args:
        connection_config: Supplies the client factory plus the API endpoint and
            token (read from its secret access config).
        collection_name: Name of the collection to open.
        keyspace: Keyspace passed to ``get_async_database``. The batch metadata
            declares this as optional, so ``None`` is forwarded as-is.

    Returns:
        The awaited result of ``get_collection`` on the async database handle.
    """
    access_configs = connection_config.access_config.get_secret_value()
    client = connection_config.get_client()

    async_astra_db = client.get_async_database(
        api_endpoint=access_configs.api_endpoint,
        token=access_configs.token,
        keyspace=keyspace,
    )
    return await async_astra_db.get_collection(name=collection_name)
132
+
133
+
134
class AstraDBUploadStagerConfig(UploadStagerConfig):
    # No Astra DB specific staging options; the base config is used unchanged.
    pass
136
+
137
+
138
class AstraDBIndexerConfig(IndexerConfig):
    # Which collection to index and how many record ids go into each batch.
    collection_name: str = Field(
        description=(
            "The name of the Astra DB collection. Note that the collection name "
            "must only include letters, numbers, and underscores."
        )
    )
    keyspace: Optional[str] = Field(
        default=None,
        description="The Astra DB connection keyspace.",
    )
    batch_size: int = Field(
        default=20,
        description="Number of records per batch",
    )
146
+
147
+
148
class AstraDBDownloaderConfig(DownloaderConfig):
    # Declared with pydantic's Field, like every other config in this module,
    # rather than dataclasses.field.
    # The downloader folds a hash of these names into each output filename.
    fields: list[str] = Field(default_factory=list)
150
+
151
+
152
class AstraDBUploaderConfig(UploaderConfig):
    # Destination collection, write batch size, and the key used to find (and
    # delete) documents written for the same record on an earlier run.
    collection_name: str = Field(
        description=(
            "The name of the Astra DB collection. Note that the collection name "
            "must only include letters, numbers, and underscores."
        )
    )
    keyspace: Optional[str] = Field(
        default=None,
        description="The Astra DB connection keyspace.",
    )
    requested_indexing_policy: Optional[dict[str, Any]] = Field(
        default=None,
        description="The indexing policy to use for the collection.",
        examples=['{"deny": ["metadata"]}'],
    )
    batch_size: int = Field(
        default=20,
        description="Number of records per batch",
    )
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
169
+
170
+
171
@dataclass
class AstraDBIndexer(Indexer):
    """Lists the document ids of an Astra DB collection and emits them in batches."""

    connection_config: AstraDBConnectionConfig
    index_config: AstraDBIndexerConfig

    def get_collection(self) -> "AstraDBCollection":
        """Open the collection named in ``index_config``."""
        return get_astra_collection(
            connection_config=self.connection_config,
            collection_name=self.index_config.collection_name,
            keyspace=self.index_config.keyspace,
        )

    def precheck(self) -> None:
        """Verify the collection is reachable by requesting its options.

        Raises:
            SourceConnectionError: If opening the collection or the options call
                fails; the original exception is chained as the cause.
        """
        try:
            self.get_collection().options()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _get_doc_ids(self) -> set[str]:
        """Fetches all document ids in an index"""
        collection = self.get_collection()

        # Only the _id field is projected; nothing else is needed for indexing.
        astra_db_docs_cursor = collection.find({}, projection={"_id": True})
        return {astra_record["_id"] for astra_record in astra_db_docs_cursor}

    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
        """Yield one ``AstraDBBatchFileData`` per batch of document ids.

        Ids are sorted before batching so that the same collection contents
        produce the same batches on every run; iterating the set directly would
        give an arbitrary order.
        """
        ids = sorted(self._get_doc_ids())

        for batch in batch_generator(ids, self.index_config.batch_size):
            yield AstraDBBatchFileData(
                connector_type=CONNECTOR_TYPE,
                metadata=FileDataSourceMetadata(
                    date_processed=str(time()),
                ),
                additional_metadata=AstraDBAdditionalMetadata(
                    collection_name=self.index_config.collection_name,
                    keyspace=self.index_config.keyspace,
                ),
                batch_items=[BatchItem(identifier=b) for b in batch],
            )
226
+
227
+
228
@dataclass
class AstraDBDownloader(Downloader):
    """Fetches batches of Astra DB documents and writes each one to its own CSV file."""

    connection_config: AstraDBConnectionConfig
    download_config: AstraDBDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def is_async(self) -> bool:
        """Report that ``run_async`` is the supported entry point."""
        return True

    def get_identifier(self, record_id: str) -> str:
        """Build the filename stem for a record.

        When ``download_config.fields`` is non-empty, the first 8 hex characters
        of the SHA-256 of the comma-joined field names are appended.
        """
        identifier = f"{record_id}"
        if self.download_config.fields:
            joined_fields = ",".join(self.download_config.fields)
            fields_digest = hashlib.sha256(joined_fields.encode()).hexdigest()[:8]
            identifier = f"{identifier}-{fields_digest}"
        return identifier

    def write_astra_result_to_csv(self, astra_result: dict, download_path: str) -> None:
        """Write one document as a two-row CSV: a header row of keys, then the values."""
        # newline="" is what the csv module documents for files handed to
        # csv.writer; without it, platforms that translate "\n" write "\r\r\n".
        with open(download_path, "w", encoding="utf8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(astra_result.keys())
            writer.writerow(astra_result.values())

    def generate_download_response(
        self, result: dict, file_data: AstraDBBatchFileData
    ) -> DownloadResponse:
        """Persist one fetched document and wrap it in a download response.

        Args:
            result: The document returned by the collection query; must contain ``_id``.
            file_data: The batch the document belongs to. Its ``source_identifiers``
                attribute is overwritten with this document's filename.

        Raises:
            SourceConnectionNetworkError: If writing the CSV fails; the original
                exception is chained as the cause.
        """
        record_id = result["_id"]
        filename_id = self.get_identifier(record_id=record_id)
        filename = f"{filename_id}.csv"  # csv to preserve column info
        download_path = self.download_dir / Path(filename)
        logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
        download_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
        except Exception as e:
            logger.error(
                f"failed to download from record {record_id} to {download_path}: {e}",
                exc_info=True,
            )
            raise SourceConnectionNetworkError(
                f"failed to download file {file_data.identifier}"
            ) from e

        # modify input file_data for download_response
        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
        cast_file_data = FileData.cast(file_data=file_data)
        cast_file_data.identifier = filename
        cast_file_data.metadata.date_processed = str(time())
        cast_file_data.metadata.record_locator = {"document_id": record_id}
        return super().generate_download_response(
            file_data=cast_file_data, download_path=download_path
        )

    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Synchronous download is not supported for this connector."""
        raise NotImplementedError("Use astradb run_async instead")

    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
        """Fetch every document listed in the batch and write each one to disk.

        Returns:
            One download response per document returned by the query.
        """
        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
        collection_name: str = astra_file_data.additional_metadata.collection_name
        keyspace: Optional[str] = astra_file_data.additional_metadata.keyspace

        async_astra_collection = await get_async_astra_collection(
            connection_config=self.connection_config,
            collection_name=collection_name,
            keyspace=keyspace,
        )
        # NOTE(review): download_config.fields is hashed into the filename by
        # get_identifier but is not applied as a projection here, so full
        # documents are always fetched. Confirm whether that is intended.
        return [
            self.generate_download_response(result=result, file_data=astra_file_data)
            async for result in async_astra_collection.find({"_id": {"$in": ids}})
        ]
302
+
303
+
304
@dataclass
class AstraDBUploadStager(UploadStager):
    """Reshapes element dictionaries into the document layout written to Astra DB."""

    upload_stager_config: AstraDBUploadStagerConfig = field(
        default_factory=AstraDBUploadStagerConfig
    )

    def truncate_dict_elements(self, element_dict: dict) -> None:
        """Cap "text" and metadata "text_as_html" at MAX_CONTENT_PARAM_BYTE_SIZE, in place.

        Each value is popped and only re-inserted when it is not None, so a key
        holding None is dropped from the dictionary.
        """
        raw_text = element_dict.pop("text", None)
        if raw_text is not None:
            element_dict["text"] = truncate_string_bytes(raw_text, MAX_CONTENT_PARAM_BYTE_SIZE)

        element_metadata = element_dict.get("metadata")
        if not isinstance(element_metadata, dict):
            return

        html = element_metadata.pop("text_as_html", None)
        if html is None:
            return
        element_metadata["text_as_html"] = truncate_string_bytes(
            html, MAX_CONTENT_PARAM_BYTE_SIZE
        )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Split an element into vector, content, record id and remaining metadata.

        ``element_dict`` is modified: "embeddings" and "text" are removed from it,
        and whatever remains is returned under the "metadata" key.
        """
        self.truncate_dict_elements(element_dict)
        vector = element_dict.pop("embeddings", None)
        content = element_dict.pop("text", None)
        return {
            "$vector": vector,
            "content": content,
            RECORD_ID_LABEL: file_data.identifier,
            "metadata": element_dict,
        }
330
+
331
+
332
@dataclass
class AstraDBUploader(Uploader):
    """Writes staged element documents to an Astra DB collection."""

    connection_config: AstraDBConnectionConfig
    upload_config: AstraDBUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    def precheck(self) -> None:
        """Verify the destination collection is reachable by requesting its options.

        Raises:
            DestinationConnectionError: If opening the collection or the options
                call fails; the original exception is chained as the cause.
        """
        try:
            # Reuse get_collection so the lookup is defined in one place.
            self.get_collection().options()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["astrapy"], extras="astradb")
    def get_collection(self) -> "AstraDBCollection":
        """Open the collection named in ``upload_config``."""
        return get_astra_collection(
            connection_config=self.connection_config,
            collection_name=self.upload_config.collection_name,
            keyspace=self.upload_config.keyspace,
        )

    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData) -> None:
        """Delete every document whose record-id key equals ``file_data.identifier``."""
        logger.debug(
            f"deleting records from collection {collection.name} "
            f"with {self.upload_config.record_id_key} "
            f"set to {file_data.identifier}"
        )
        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
        delete_resp = collection.delete_many(filter=delete_filter)
        logger.debug(
            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
        )

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the record's existing documents with ``data``.

        Documents previously written for this record are deleted first, then
        ``data`` is inserted in chunks of ``upload_config.batch_size``.
        """
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name}"
        )

        collection = self.get_collection()
        self.delete_by_record_id(collection=collection, file_data=file_data)

        for chunk in batch_generator(data, self.upload_config.batch_size):
            collection.insert_many(chunk)

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load staged elements from ``path`` and upload them via ``run_data``."""
        data = get_data(path=path)
        self.run_data(data=data, file_data=file_data, **kwargs)
386
+
387
+
388
# Registry entry bundling the classes that make up the Astra DB source connector.
astra_db_source_entry = SourceRegistryEntry(
    connection_config=AstraDBConnectionConfig,
    indexer_config=AstraDBIndexerConfig,
    indexer=AstraDBIndexer,
    downloader_config=AstraDBDownloaderConfig,
    downloader=AstraDBDownloader,
)
395
+
396
# Registry entry bundling the classes that make up the Astra DB destination connector.
astra_db_destination_entry = DestinationRegistryEntry(
    connection_config=AstraDBConnectionConfig,
    uploader_config=AstraDBUploaderConfig,
    uploader=AstraDBUploader,
    upload_stager_config=AstraDBUploadStagerConfig,
    upload_stager=AstraDBUploadStager,
)
@@ -0,0 +1,276 @@
1
+ import json
2
+ from contextlib import contextmanager
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Any, Generator
5
+
6
+ from pydantic import Field, Secret
7
+
8
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
9
+ from unstructured_ingest.utils.data_prep import batch_generator
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
12
+ from unstructured_ingest.v2.interfaces import (
13
+ AccessConfig,
14
+ ConnectionConfig,
15
+ FileData,
16
+ Uploader,
17
+ UploaderConfig,
18
+ UploadStager,
19
+ UploadStagerConfig,
20
+ )
21
+ from unstructured_ingest.v2.logger import logger
22
+ from unstructured_ingest.v2.processes.connector_registry import (
23
+ DestinationRegistryEntry,
24
+ )
25
+ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
26
+ from unstructured_ingest.v2.utils import get_enhanced_element_id
27
+
28
+ if TYPE_CHECKING:
29
+ from azure.search.documents import SearchClient
30
+ from azure.search.documents.indexes import SearchIndexClient
31
+
32
# Identifier used by the uploader below to tag itself as the Azure AI Search connector.
CONNECTOR_TYPE = "azure_ai_search"
33
+
34
+
35
class AzureAISearchAccessConfig(AccessConfig):
    # API key for the search service; the field is populated through the alias "key".
    azure_ai_search_key: str = Field(
        alias="key",
        description="Credential that is used for authenticating to an Azure service",
    )
39
+
40
+
41
class AzureAISearchConnectionConfig(ConnectionConfig):
    # Endpoint, index name and credentials for one Azure AI Search index, plus
    # context managers that open SDK clients against it.
    endpoint: str = Field(
        description=(
            "The URL endpoint of an Azure AI (Cognitive) search service. In the form of "
            "https://{{service_name}}.search.windows.net"
        )
    )
    index: str = Field(
        description="The name of the Azure AI (Cognitive) Search index to connect to."
    )
    access_config: Secret[AzureAISearchAccessConfig]

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_client(self) -> Generator["SearchClient", None, None]:
        """Yield a SearchClient bound to the configured index; closed on exit."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        credential = AzureKeyCredential(api_key)
        with SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=credential,
        ) as client:
            yield client

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
        """Yield a SearchIndexClient for the configured endpoint; closed on exit."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        credential = AzureKeyCredential(api_key)
        with SearchIndexClient(
            endpoint=self.endpoint,
            credential=credential,
        ) as search_index_client:
            yield search_index_client
79
+
80
+
81
class AzureAISearchUploadStagerConfig(UploadStagerConfig):
    # No Azure AI Search specific staging options; the base config is used unchanged.
    pass
83
+
84
+
85
class AzureAISearchUploaderConfig(UploaderConfig):
    # Upload batch size and the index field used to locate documents written for
    # the same record on an earlier run.
    batch_size: int = Field(
        default=100,
        description="Number of records per batch",
    )
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
91
+
92
+
93
@dataclass
class AzureAISearchUploadStager(UploadStager):
    """Converts element dictionaries into the shape written to the search index."""

    upload_stager_config: AzureAISearchUploadStagerConfig = field(
        default_factory=lambda: AzureAISearchUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
        Azure Cognitive Search index

        Args:
            element_dict: The element as a dictionary. It is not modified; all
                changes are made on a deep copy.
            file_data: Source of the record identifier stored under RECORD_ID_LABEL
                and an input to the generated document id.

        Returns:
            A new dictionary with "id" and RECORD_ID_LABEL set, structured metadata
            values JSON-encoded, and date fields reformatted as
            ``%Y-%m-%dT%H:%M:%S.%fZ`` strings.
        """
        import copy

        # A shallow copy would share the nested "metadata" dict with the caller,
        # so the in-place rewrites below would leak back into element_dict.
        data = copy.deepcopy(element_dict)
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
        data[RECORD_ID_LABEL] = file_data.identifier

        # "or {}" also covers keys that are present but hold None, which a plain
        # .get(key, {}) would pass through and then fail on.
        metadata = data.get("metadata") or {}
        coordinates = metadata.get("coordinates") or {}
        data_source = metadata.get("data_source") or {}

        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)
        if version := data_source.get("version"):
            data_source["version"] = str(version)
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)
        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)
        if links := metadata.get("links"):
            metadata["links"] = [json.dumps(link) for link in links]

        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = parse_datetime(last_modified).strftime(timestamp_format)
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = parse_datetime(date_value).strftime(timestamp_format)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)
        return data
145
+
146
+
147
@dataclass
class AzureAISearchUploader(Uploader):
    """Uploads staged documents to an Azure AI Search index."""

    upload_config: AzureAISearchUploaderConfig
    connection_config: AzureAISearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def query_docs(self, record_id: str, index_key: str) -> list[str]:
        """Return the key values of all documents stored for ``record_id``.

        Args:
            record_id: Value to match against the configured record-id field.
            index_key: Name of the index's key field; only this field is selected.
        """
        # OData string literals escape a single quote by doubling it; an
        # unescaped quote in the id would produce a malformed filter.
        escaped_record_id = record_id.replace("'", "''")
        # Filter on the configured field, the same one can_delete() validates.
        record_filter = f"{self.upload_config.record_id_key} eq '{escaped_record_id}'"
        with self.connection_config.get_search_client() as search_client:
            results = list(search_client.search(filter=record_filter, select=[index_key]))
        return [result[index_key] for result in results]

    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
        """Delete every document previously written for ``file_data.identifier``.

        Raises:
            WriteError: If any individual delete is reported as not succeeded.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from azure cognitive search index: {self.connection_config.index}"
        )
        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
        if not doc_ids_to_delete:
            return
        with self.connection_config.get_search_client() as search_client:
            results = search_client.delete_documents(
                documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
            )
            errors = []
            success = []
            for result in results:
                if result.succeeded:
                    success.append(result)
                else:
                    errors.append(result)
            logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
            if errors:
                raise WriteError(
                    ", ".join(
                        [f"[{error.status_code}] {error.error_message}" for error in errors],
                    ),
                )

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-ai-search")
    def write_dict(
        self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
    ) -> None:
        """Upload one batch of documents through ``search_client``.

        Raises:
            WriteError: On an HTTP response error from the upload call, or if any
                document in the batch is reported as not succeeded.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            results = search_client.upload_documents(documents=elements_dict)
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        errors = []
        success = []
        for result in results:
            if result.succeeded:
                success.append(result)
            else:
                errors.append(result)
        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    [
                        f"{error.key}: [{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),
            )

    def can_delete(self) -> bool:
        """Report whether the index has a filterable field named ``record_id_key``."""
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            index_fields = index.fields
        record_id_field = next(
            (
                index_field
                for index_field in index_fields
                if index_field.name == self.upload_config.record_id_key
            ),
            None,
        )
        if record_id_field is None:
            return False
        # bool() keeps the declared return type even if the attribute is unset.
        return bool(record_id_field.filterable)

    def get_index_key(self) -> str:
        """Return the name of the first index field flagged as the key.

        Raises:
            ValueError: If no field in the index is flagged as a key.
        """
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
            index_fields = index.fields
        key_fields = [index_field for index_field in index_fields if index_field.key]
        if not key_fields:
            raise ValueError("no key field found in index fields")
        return key_fields[0].name

    def precheck(self) -> None:
        """Verify the index is reachable by requesting its document count.

        Raises:
            DestinationConnectionError: If the client cannot be created or the
                count call fails; the original exception is chained as the cause.
        """
        try:
            with self.connection_config.get_search_client() as search_client:
                search_client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the record's existing documents (when possible) and upload ``data``.

        Previous documents are deleted only when ``can_delete()`` is true;
        otherwise a warning is logged and the upload proceeds regardless.
        """
        logger.info(
            f"writing document batches to destination"
            f" endpoint at {self.connection_config.endpoint}"
            f" index at {self.connection_config.index}"
            f" with batch size {self.upload_config.batch_size}"
        )
        if self.can_delete():
            index_key = self.get_index_key()
            self.delete_by_record_id(file_data=file_data, index_key=index_key)
        else:
            logger.warning("criteria for deleting previous content not met, skipping")

        batch_size = self.upload_config.batch_size
        with self.connection_config.get_search_client() as search_client:
            for chunk in batch_generator(data, batch_size):
                self.write_dict(elements_dict=chunk, search_client=search_client)
268
+
269
+
270
# Registry entry bundling the classes that make up the Azure AI Search destination.
azure_ai_search_destination_entry = DestinationRegistryEntry(
    connection_config=AzureAISearchConnectionConfig,
    upload_stager_config=AzureAISearchUploadStagerConfig,
    upload_stager=AzureAISearchUploadStager,
    uploader_config=AzureAISearchUploaderConfig,
    uploader=AzureAISearchUploader,
)