unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557) [hide | show]
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,267 @@
1
+ import json
2
+ from abc import ABC, abstractmethod
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from datetime import date, datetime
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from dateutil import parser
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.error import DestinationConnectionError, WriteError
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ FileData,
18
+ Uploader,
19
+ UploaderConfig,
20
+ UploadStager,
21
+ UploadStagerConfig,
22
+ )
23
+ from unstructured_ingest.v2.logger import logger
24
+
25
+ if TYPE_CHECKING:
26
+ from weaviate.classes.init import Timeout
27
+ from weaviate.client import WeaviateClient
28
+ from weaviate.collections.batch.client import BatchClient
29
+
30
+ CONNECTOR_TYPE = "weaviate"
31
+
32
+
33
class WeaviateAccessConfig(AccessConfig, ABC):
    """Base access configuration for Weaviate connectors; declares no fields of its own."""
35
+
36
+
37
class WeaviateConnectionConfig(ConnectionConfig, ABC):
    """Connection settings shared by the Weaviate connection flavours.

    Holds the three timeout values and the access configuration. How a client is
    actually created is left to subclasses through ``get_client``.
    """

    init_timeout: int = Field(default=2, ge=0, description="Timeout for initialization checks")
    insert_timeout: int = Field(default=90, ge=0, description="Timeout for insert operations")
    query_timeout: int = Field(default=30, ge=0, description="Timeout for query operations")
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(), validate_default=True
    )

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_timeout(self) -> "Timeout":
        """Return a weaviate ``Timeout`` populated from the configured timeout fields."""
        # Imported here so this module can be loaded without the optional weaviate extra.
        from weaviate.classes.init import Timeout

        timeout_settings = {
            "init": self.init_timeout,
            "query": self.query_timeout,
            "insert": self.insert_timeout,
        }
        return Timeout(**timeout_settings)

    @abstractmethod
    @contextmanager
    def get_client(self) -> Generator["WeaviateClient", None, None]:
        """Context manager yielding a ``WeaviateClient``; subclasses must implement it."""
55
+
56
+
57
class WeaviateUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Weaviate upload stager; adds nothing to the base config."""
59
+
60
+
61
@dataclass
class WeaviateUploadStager(UploadStager):
    """Upload stager that reshapes element dictionaries before they are sent to Weaviate."""

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or as a date string.

        The value is first interpreted as a float timestamp; when that is not
        possible it is handed to ``dateutil.parser.parse``.

        Args:
            date_string: timestamp (e.g. ``"1700000000.0"``) or textual date.

        Returns:
            The parsed ``datetime`` (a ``date`` subclass).

        Raises:
            Whatever ``dateutil.parser.parse`` raises when the fallback also fails.
        """
        try:
            timestamp = float(date_string)
            # NOTE(review): fromtimestamp() without a tz returns local time, while
            # conform_dict() formats the result with a literal "Z" suffix - confirm
            # whether UTC was intended before changing this.
            return datetime.fromtimestamp(timestamp)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Not a usable timestamp: fall through to the free-form parser.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of ``element_dict`` adjusted to conform to the Weaviate schema.

        Nested structures are serialized to JSON strings, date fields are
        normalized to ``%Y-%m-%dT%H:%M:%S.%fZ`` and a few scalar fields are cast
        to ``str``. The record id of ``file_data`` is stored under
        ``RECORD_ID_LABEL``.

        Args:
            element_dict: the element to convert; it is not modified.
            file_data: file data whose ``identifier`` is attached to the result.

        Returns:
            A new dictionary ready to be written.
        """
        from copy import deepcopy

        # A shallow copy would share the nested "metadata" dictionaries with the
        # caller, so the rewrites below would leak into the input element.
        working_data = deepcopy(element_dict)

        # "or {}" also covers sections that are present but set to None; values are
        # only written back when something was found, i.e. when the section is real.
        metadata = working_data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        for date_key in ("date_created", "date_modified", "date_processed"):
            if raw_date := data_source.get(date_key):
                data_source[date_key] = self.parse_date_string(raw_date).strftime(date_format)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = self.parse_date_string(last_modified).strftime(
                date_format
            )

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

        working_data[RECORD_ID_LABEL] = file_data.identifier
        return working_data
160
+
161
+
162
class WeaviateUploaderConfig(UploaderConfig):
    """Uploader settings: target collection, batching mode and record-id property.

    Exactly one batching mode (fixed size, rate limited or dynamic) must be
    enabled; this is enforced right after the model is initialized.
    """

    collection: str = Field(description="The name of the collection this object belongs to")
    batch_size: Optional[int] = Field(default=None, description="Number of records per batch")
    requests_per_minute: Optional[int] = Field(default=None, description="Rate limit for upload")
    dynamic_batch: bool = Field(default=True, description="Whether to use dynamic batch")
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that one, and only one, batch mode is enabled.

        Raises:
            ValueError: if no batch mode or more than one batch mode is enabled.
        """
        candidate_modes = (
            ("fixed_size", self.batch_size is not None),
            ("rate_limited", self.requests_per_minute is not None),
            ("dynamic", self.dynamic_batch),
        )
        active_modes = [mode_name for mode_name, is_enabled in candidate_modes if is_enabled]

        if len(active_modes) == 0:
            raise ValueError("No batch mode enabled")
        if len(active_modes) > 1:
            listed_modes = ", ".join(active_modes)
            raise ValueError(
                "Multiple batch modes enabled, only one mode can be used: {}".format(listed_modes)
            )
        logger.info(f"Uploader config instantiated with {active_modes[0]} batch mode")

    @contextmanager
    def get_batch_client(self, client: "WeaviateClient") -> Generator["BatchClient", None, None]:
        """Yield the batch client matching the configured batch mode.

        Args:
            client: connected Weaviate client whose ``batch`` attribute is used.

        Raises:
            ValueError: if none of the batch settings is truthy.
        """
        # Pick the batching context first, then enter it in a single place.
        if self.dynamic_batch:
            batch_context = client.batch.dynamic()
        elif self.batch_size:
            batch_context = client.batch.fixed_size(batch_size=self.batch_size)
        elif self.requests_per_minute:
            batch_context = client.batch.rate_limit(requests_per_minute=self.requests_per_minute)
        else:
            raise ValueError("No batch mode enabled")

        with batch_context as batch_client:
            yield batch_client
205
+
206
+
207
@dataclass
class WeaviateUploader(Uploader, ABC):
    """Uploader that writes element dictionaries to a Weaviate collection in batches."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig

    def precheck(self) -> None:
        """Verify that a client can be obtained from the connection config.

        Raises:
            DestinationConnectionError: if opening the client fails.
        """
        try:
            # get_client() is a context manager: its body only runs once the context
            # is entered, so merely calling it would never surface a connection error.
            with self.connection_config.get_client():
                pass
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def check_for_errors(self, client: "WeaviateClient") -> None:
        """Log every failed batch object and raise if there was any.

        Raises:
            WriteError: if ``client.batch.failed_objects`` is not empty.
        """
        failed_uploads = client.batch.failed_objects
        if not failed_uploads:
            return
        for failure in failed_uploads:
            logger.error(
                f"Failed to upload object with id {failure.original_uuid}: {failure.message}"
            )
        raise WriteError("Failed to upload to weaviate")

    @requires_dependencies(["weaviate"], extras="weaviate")
    def delete_by_record_id(self, client: "WeaviateClient", file_data: FileData) -> None:
        """Delete the objects whose record-id property equals ``file_data.identifier``.

        Raises:
            WriteError: if a delete request reports failures.
        """
        from weaviate.classes.query import Filter

        record_id = file_data.identifier
        collection = client.collections.get(self.upload_config.collection)
        delete_filter = Filter.by_property(name=self.upload_config.record_id_key).equal(
            val=record_id
        )
        # There is a configurable maximum limit (QUERY_MAXIMUM_RESULTS) on the number of
        # objects that can be deleted in a single query (default 10,000). To delete
        # more objects than the limit, re-run the query until nothing is deleted.
        while True:
            resp = collection.data.delete_many(where=delete_filter)
            if resp.failed:
                raise WriteError(
                    f"failed to delete records in collection "
                    f"{self.upload_config.collection} with record "
                    f"id property {record_id}"
                )
            if not resp.successful:
                break

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Replace the stored objects of ``file_data`` with the given elements.

        Objects previously written for the same record id are deleted first, then
        every element is added through the configured batch client.

        Raises:
            WriteError: if deleting old objects or uploading new ones fails.
        """
        # Log the destination collection; the access config is a secret and says
        # nothing about where the data goes.
        logger.info(
            f"writing {len(data)} objects to destination "
            f"class {self.upload_config.collection}"
        )

        with self.connection_config.get_client() as weaviate_client:
            self.delete_by_record_id(client=weaviate_client, file_data=file_data)
            with self.upload_config.get_batch_client(client=weaviate_client) as batch_client:
                for e in data:
                    # The embedding is removed from the element (in place) so it is
                    # passed as the vector rather than as a regular property.
                    vector = e.pop("embeddings", None)
                    batch_client.add_object(
                        collection=self.upload_config.collection,
                        properties=e,
                        vector=vector,
                    )
            # Checked only after the batch context has exited.
            self.check_for_errors(client=weaviate_client)
@@ -0,0 +1,195 @@
1
+ import json
2
+ from abc import ABC
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Any, Literal, Optional
6
+
7
+ from pydantic import BaseModel, Field, SecretStr
8
+
9
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
10
+
11
+ if TYPE_CHECKING:
12
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
13
+
14
+
15
class EmbedderConfig(BaseModel):
    """Settings that select an embedding provider and build its encoder.

    ``get_embedder`` is the entry point; the ``get_*_embedder`` helpers import the
    provider modules lazily so that only the selected provider is imported.
    """

    embedding_provider: Optional[
        Literal[
            "openai",
            "azure-openai",
            "huggingface",
            "aws-bedrock",
            "vertexai",
            "voyageai",
            "octoai",
            "mixedbread-ai",
            "togetherai",
        ]
    ] = Field(default=None, description="Type of the embedding class to be used.")
    embedding_api_key: Optional[SecretStr] = Field(
        default=None,
        description="API key for the embedding model, for the case an API key is needed.",
    )
    embedding_model_name: Optional[str] = Field(
        default=None,
        description="Embedding model name, if needed. "
        "Chooses a particular LLM between different options, to embed with it.",
    )
    embedding_aws_access_key_id: Optional[str] = Field(
        default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_secret_access_key: Optional[SecretStr] = Field(
        default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
    )
    embedding_aws_region: Optional[str] = Field(
        default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
    )
    embedding_azure_endpoint: Optional[str] = Field(
        default=None,
        description="Your Azure endpoint, including the resource, "
        "e.g. `https://example-resource.azure.openai.com/`",
    )
    embedding_azure_api_version: Optional[str] = Field(
        description="Azure API version", default=None
    )

    def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the HuggingFace encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.huggingface import (
            HuggingFaceEmbeddingConfig,
            HuggingFaceEmbeddingEncoder,
        )

        return HuggingFaceEmbeddingEncoder(
            config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OpenAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))

    def get_azure_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Azure OpenAI encoder from the Azure-specific fields of this config.

        ``embedding_kwargs`` is accepted for signature parity with the other
        helpers but is not used.
        """
        from unstructured_ingest.embed.azure_openai import (
            AzureOpenAIEmbeddingConfig,
            AzureOpenAIEmbeddingEncoder,
        )

        # NOTE(review): the API key is passed as a SecretStr (or None) here, whereas
        # get_embedder() unwraps it for the other providers - confirm that
        # AzureOpenAIEmbeddingConfig expects this form.
        config_kwargs = {
            "api_key": self.embedding_api_key,
            "azure_endpoint": self.embedding_azure_endpoint,
        }
        # Optional settings are only forwarded when set.
        if api_version := self.embedding_azure_api_version:
            config_kwargs["api_version"] = api_version
        if model_name := self.embedding_model_name:
            config_kwargs["model_name"] = model_name

        return AzureOpenAIEmbeddingEncoder(
            config=AzureOpenAIEmbeddingConfig.model_validate(config_kwargs)
        )

    def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the OctoAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))

    def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
        """Build the AWS Bedrock encoder from the AWS fields of this config."""
        from unstructured_ingest.embed.bedrock import (
            BedrockEmbeddingConfig,
            BedrockEmbeddingEncoder,
        )

        # The secret key is optional on this model; unwrap it only when present so a
        # missing key is handled by BedrockEmbeddingConfig instead of failing here
        # with an AttributeError on None.
        secret_key = self.embedding_aws_secret_access_key
        secret_value = secret_key.get_secret_value() if secret_key is not None else None

        return BedrockEmbeddingEncoder(
            config=BedrockEmbeddingConfig(
                aws_access_key_id=self.embedding_aws_access_key_id,
                aws_secret_access_key=secret_value,
                region_name=self.embedding_aws_region,
            )
        )

    def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the VertexAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.vertexai import (
            VertexAIEmbeddingConfig,
            VertexAIEmbeddingEncoder,
        )

        return VertexAIEmbeddingEncoder(
            config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the VoyageAI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.voyageai import (
            VoyageAIEmbeddingConfig,
            VoyageAIEmbeddingEncoder,
        )

        return VoyageAIEmbeddingEncoder(
            config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Mixedbread AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.mixedbreadai import (
            MixedbreadAIEmbeddingConfig,
            MixedbreadAIEmbeddingEncoder,
        )

        return MixedbreadAIEmbeddingEncoder(
            config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_togetherai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
        """Build the Together AI encoder from ``embedding_kwargs``."""
        from unstructured_ingest.embed.togetherai import (
            TogetherAIEmbeddingConfig,
            TogetherAIEmbeddingEncoder,
        )

        return TogetherAIEmbeddingEncoder(
            config=TogetherAIEmbeddingConfig.model_validate(embedding_kwargs)
        )

    def get_embedder(self) -> "BaseEmbeddingEncoder":
        """Return the encoder for ``embedding_provider``.

        The API key (unwrapped) and the model name are forwarded to the provider
        helper when they are set.

        Raises:
            ValueError: if ``embedding_provider`` is unset or not recognized.
        """
        kwargs: dict[str, Any] = {}
        if self.embedding_api_key:
            kwargs["api_key"] = self.embedding_api_key.get_secret_value()
        if self.embedding_model_name:
            kwargs["model_name"] = self.embedding_model_name

        # Bedrock reads its own AWS fields and takes no keyword arguments.
        if self.embedding_provider == "aws-bedrock":
            return self.get_bedrock_embedder()

        # TODO make this more dynamic to map to encoder configs
        factories = {
            "openai": self.get_openai_embedder,
            "huggingface": self.get_huggingface_embedder,
            "octoai": self.get_octoai_embedder,
            "vertexai": self.get_vertexai_embedder,
            "voyageai": self.get_voyageai_embedder,
            "mixedbread-ai": self.get_mixedbread_embedder,
            "togetherai": self.get_togetherai_embedder,
            "azure-openai": self.get_azure_openai_embedder,
        }
        factory = factories.get(self.embedding_provider)
        if factory is None:
            raise ValueError(f"{self.embedding_provider} not a recognized encoder")
        return factory(embedding_kwargs=kwargs)
181
+
182
+
183
@dataclass
class Embedder(BaseProcess, ABC):
    """Process step that embeds the elements stored in a JSON file."""

    config: EmbedderConfig

    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
        """Embed the elements found in ``elements_filepath``.

        Args:
            elements_filepath: path of a JSON file holding the elements.
            **kwargs: accepted and ignored.

        Returns:
            The value returned by the encoder's ``embed_documents``, or an empty
            list when the file holds no elements.
        """
        # TODO update base embedder classes to support async
        embedder = self.config.get_embedder()
        with elements_filepath.open("r") as elements_file:
            elements = json.load(elements_file)
        if not elements:
            # Nothing to embed. The elements are plain JSON values, so there is
            # nothing to convert either; this also covers a JSON ``null`` payload.
            return []
        return embedder.embed_documents(elements=elements)
@@ -0,0 +1,60 @@
1
+ import fnmatch
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Callable, Optional
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from unstructured_ingest.v2.interfaces import FileData
9
+ from unstructured_ingest.v2.interfaces.process import BaseProcess
10
+ from unstructured_ingest.v2.logger import logger
11
+
12
+
13
class FiltererConfig(BaseModel):
    """Criteria used to decide which files are kept: glob patterns and a size limit."""

    file_glob: Optional[list[str]] = Field(
        default=None,
        description="file globs to limit which types of files are accepted",
        examples=["*.pdf", "*.html"],
    )
    max_file_size: Optional[int] = Field(
        default=None,
        description="Max file size to process in bytes",
    )
22
+
23
+
24
@dataclass
class Filterer(BaseProcess, ABC):
    """Process step that drops file data not matching the configured criteria."""

    config: FiltererConfig = field(default_factory=FiltererConfig)
    filters: list[Callable[[FileData], bool]] = field(init=False, default_factory=list)

    def __post_init__(self):
        """Register the filter callables that correspond to the configured options."""
        settings = self.config
        enabled_filters = []
        if settings.file_glob is not None:
            enabled_filters.append(self.glob_filter)
        # Truthiness check: a limit of None (or 0) registers no size filter.
        if settings.max_file_size:
            enabled_filters.append(self.file_size_filter)
        self.filters.extend(enabled_filters)

    def is_async(self) -> bool:
        """This process always runs synchronously."""
        return False

    def file_size_filter(self, file_data: FileData) -> bool:
        """Return whether the file size is within ``max_file_size``.

        Files whose recorded size is missing or zero are always accepted.
        """
        size_in_bytes = file_data.metadata.filesize_bytes
        if not size_in_bytes:
            return True
        return size_in_bytes <= self.config.max_file_size

    def glob_filter(self, file_data: FileData) -> bool:
        """Return whether the file's full path matches at least one configured glob."""
        path = file_data.source_identifiers.fullpath
        if any(fnmatch.filter([path], glob) for glob in self.config.file_glob):
            return True
        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
        """Return ``file_data`` if every filter accepts it, otherwise ``None``.

        Filters are evaluated in registration order and evaluation stops at the
        first one that rejects the file.
        """
        rejecting = next((check for check in self.filters if not check(file_data)), None)
        if rejecting is None:
            return file_data
        logger.debug(f"filtered out file data due to {rejecting.__name__}: {file_data.identifier}")
        return None