unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,190 @@
1
+ import copy
2
+ import json
3
+ import typing as t
4
+ from dataclasses import dataclass, field
5
+
6
+ from unstructured_ingest.enhanced_dataclass import enhanced_field
7
+ from unstructured_ingest.enhanced_dataclass.core import _asdict
8
+ from unstructured_ingest.error import DestinationConnectionError, SourceConnectionError
9
+ from unstructured_ingest.interfaces import (
10
+ AccessConfig,
11
+ BaseConnectorConfig,
12
+ BaseDestinationConnector,
13
+ WriteConfig,
14
+ )
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+
18
+ if t.TYPE_CHECKING:
19
+ from weaviate import Client
20
+
21
+
22
@dataclass
class WeaviateAccessConfig(AccessConfig):
    """Optional credentials for connecting to a Weaviate instance.

    Every field defaults to "not provided". Secret-bearing values are declared
    through ``enhanced_field(..., sensitive=True)``.
    """

    # Bearer-token authentication (the refresh token accompanies the access token).
    access_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    refresh_token: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # API-key authentication.
    api_key: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Client-credentials authentication.
    client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # Scope forwarded with client-secret or username/password credentials.
    scope: t.Optional[t.List[str]] = None
    # Username/password authentication; both must be set to be used.
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    # When True, no credentials are sent at all.
    anonymous: bool = False
32
+
33
+
34
@dataclass
class SimpleWeaviateConfig(BaseConnectorConfig):
    """Connection settings for the Weaviate destination connector."""

    # Credentials used to authenticate against the instance.
    access_config: WeaviateAccessConfig
    # URL handed to the Weaviate client.
    host_url: str
    # Name of the Weaviate class that objects are written into.
    class_name: str
39
+
40
+
41
@dataclass
class WeaviateWriteConfig(WriteConfig):
    """Write-side tuning for the Weaviate destination connector."""

    # Batch size passed to ``client.batch.configure`` before writing.
    batch_size: int = 100
44
+
45
+
46
@dataclass
class WeaviateDestinationConnector(BaseDestinationConnector):
    """Destination connector that batch-writes element dictionaries to Weaviate."""

    write_config: WeaviateWriteConfig
    connector_config: SimpleWeaviateConfig
    # Lazily created by the ``client`` property; never supplied by callers.
    _client: t.Optional["Client"] = field(init=False, default=None)

    def to_dict(self, **kwargs):
        """
        Serialize the connector without its live client.

        The _client variable in this dataclass breaks deepcopy due to:
            TypeError: cannot pickle '_thread.lock' object
        When serializing, remove it, meaning client data will need to be reinitialized
        when deserialized
        """
        # Shallow copy so that the live client on ``self`` is left untouched.
        self_cp = copy.copy(self)
        if hasattr(self_cp, "_client"):
            setattr(self_cp, "_client", None)
        return _asdict(self_cp, **kwargs)

    @property
    @requires_dependencies(["weaviate"], extras="weaviate")
    def client(self) -> "Client":
        """Return the Weaviate client, creating it on first access."""
        if self._client is None:
            from weaviate import Client

            auth = self._resolve_auth_method()
            self._client = Client(url=self.connector_config.host_url, auth_client_secret=auth)
        return self._client

    @requires_dependencies(["weaviate"], extras="weaviate")
    @DestinationConnectionError.wrap
    def initialize(self):
        """Eagerly create the client so connection problems surface early."""
        _ = self.client

    @requires_dependencies(["weaviate"], extras="weaviate")
    def check_connection(self):
        """Validate that a client can be created.

        Raises:
            SourceConnectionError: if creating the client fails.
        """
        try:
            _ = self.client
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # NOTE(review): a destination connector raising SourceConnectionError looks
            # like a copy/paste slip; the type is kept so existing callers that catch
            # it keep working. The original cause is now chained for debugging.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _resolve_auth_method(self):
        """Pick the auth object matching the first populated credential.

        Precedence: anonymous, access token, API key, client secret,
        username + password. Returns ``None`` when nothing applies.
        """
        access_configs = self.connector_config.access_config
        if access_configs.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=access_configs.refresh_token,
            )
        if access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        if access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=access_configs.scope
            )
        if access_configs.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=access_configs.username,
                password=access_configs.password,
                scope=access_configs.scope,
            )
        return None

    @staticmethod
    def _to_utc_timestamp(value: str) -> str:
        """Parse a date string and render it as a ``...Z`` (UTC) timestamp.

        Offset-aware inputs are converted to UTC first, because the trailing
        ``Z`` in the output format asserts UTC. Naive inputs are formatted
        as-is, i.e. they are taken to already be in UTC.
        """
        from datetime import timezone

        from dateutil import parser

        parsed = parser.parse(value)
        if parsed.utcoffset() is not None:
            parsed = parsed.astimezone(timezone.utc)
        return parsed.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    def conform_dict(self, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The dictionary is modified in place. Nested structures are serialized
        to JSON strings, dates are normalized to UTC timestamps and a few
        scalar fields are cast to ``str``. Missing, empty or ``None`` values
        are left untouched.
        """
        # ``or {}`` tolerates keys that are present but set to None. The
        # fallback dicts are never written to: they hold no truthy values.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            if date_value := data_source.get(date_key):
                data_source[date_key] = self._to_utc_timestamp(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = self._to_utc_timestamp(last_modified)

        # String casting
        if version := data_source.get("version"):
            data_source["version"] = str(version)

        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Write the given element dictionaries to the configured class in batches.

        Note that the ``embeddings`` key is popped from each element (the
        caller's dictionaries are modified) and sent as the object's vector.
        """
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connector_config.class_name} "
            f"at {self.connector_config.host_url}",
        )

        self.client.batch.configure(batch_size=self.write_config.batch_size)
        with self.client.batch as b:
            for element in elements_dict:
                vector = element.pop("embeddings", None)
                b.add_data_object(
                    element,
                    self.connector_config.class_name,
                    vector=vector,
                )
@@ -0,0 +1,208 @@
1
+ import typing as t
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+
5
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
6
+ from unstructured_ingest.interfaces import (
7
+ BaseConnectorConfig,
8
+ BaseSingleIngestDoc,
9
+ BaseSourceConnector,
10
+ IngestDocCleanupMixin,
11
+ SourceConnectorCleanupMixin,
12
+ SourceMetadata,
13
+ )
14
+ from unstructured_ingest.logger import logger
15
+ from unstructured_ingest.utils.dep_check import requires_dependencies
16
+
17
+ if t.TYPE_CHECKING:
18
+ from wikipedia import WikipediaPage
19
+
20
+
21
@dataclass
class SimpleWikipediaConfig(BaseConnectorConfig):
    """Settings identifying the Wikipedia page to ingest."""

    # Title handed to ``wikipedia.page``.
    page_title: str
    # Forwarded as the ``auto_suggest`` argument of ``wikipedia.page``.
    auto_suggest: bool = False
25
+
26
+
27
@dataclass
class WikipediaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
    """Base ingest document for one rendering of a Wikipedia page.

    Subclasses supply ``filename``, ``text`` and ``_output_filename``.
    """

    connector_config: SimpleWikipediaConfig = field(repr=False)

    @property
    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def page(self) -> "WikipediaPage":
        """Look the configured page up via ``wikipedia.page`` (on every access)."""
        import wikipedia

        return wikipedia.page(
            self.connector_config.page_title,
            auto_suggest=self.connector_config.auto_suggest,
        )

    def get_filename_prefix(self) -> str:
        """Return the page title with each whitespace run collapsed to one ``-``."""
        return "-".join(str(self.connector_config.page_title).split())

    @property
    def filename(self) -> Path:
        """Local download path; must be provided by subclasses."""
        raise NotImplementedError()

    @property
    def text(self) -> str:
        """Content to store locally; must be provided by subclasses."""
        raise NotImplementedError()

    @property
    def _output_filename(self):
        """Path of the processed output; must be provided by subclasses."""
        raise NotImplementedError()

    @property
    def date_created(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def date_modified(self) -> t.Optional[str]:
        """Always ``None`` for this connector."""
        return None

    @property
    def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
        """Return the page title and source URL that identify this document."""
        return {
            "page_title": self.connector_config.page_title,
            "page_url": self.source_metadata.source_url,  # type: ignore
        }

    def _create_full_tmp_dir_path(self):
        """Make sure the directory that will hold ``filename`` exists."""
        self.filename.parent.mkdir(parents=True, exist_ok=True)

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def update_source_metadata(self):
        """Populate ``source_metadata`` from the page.

        A ``PageError`` is recorded as ``exists=False`` instead of being raised.
        """
        from wikipedia.exceptions import PageError

        try:
            page = self.page
        except PageError:
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return

        self.source_metadata = SourceMetadata(
            version=page.revision_id,
            source_url=page.url,
            exists=True,
        )

    @SourceConnectionError.wrap
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Fetches the "remote" doc and stores it locally on the filesystem."""
        self._create_full_tmp_dir_path()
        self.update_source_metadata()
        # Fetch before opening: opening with "w" truncates the target, so a
        # failed fetch must not leave an empty (or clobbered) file behind.
        content = self.text
        with open(self.filename, "w", encoding="utf8") as f:
            f.write(content)
102
+
103
+
104
@dataclass
class WikipediaIngestHTMLDoc(WikipediaIngestDoc):
    """Ingest document holding the HTML rendering of the page."""

    registry_name: str = "wikipedia_html"

    @property
    def filename(self) -> Path:
        """Resolved path of the downloaded ``.html`` file."""
        download_root = Path(self.read_config.download_dir)
        return (download_root / f"{self.get_filename_prefix()}.html").resolve()

    @property
    def text(self):
        """HTML of the page."""
        return self._get_html()

    @SourceConnectionNetworkError.wrap
    def _get_html(self):
        """Retrieve the HTML from the page object."""
        wiki_page = self.page
        return wiki_page.html()

    @property
    def _output_filename(self):
        """Path of the JSON output for the HTML rendering."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-html.json"
125
+
126
+
127
@dataclass
class WikipediaIngestTextDoc(WikipediaIngestDoc):
    """Ingest document holding the ``content`` attribute of the page."""

    registry_name: str = "wikipedia_text"

    @property
    def filename(self) -> Path:
        """Resolved path of the downloaded ``.txt`` file."""
        download_root = Path(self.read_config.download_dir)
        return (download_root / f"{self.get_filename_prefix()}.txt").resolve()

    @property
    def text(self):
        """Content of the page."""
        return self._get_content()

    @SourceConnectionNetworkError.wrap
    def _get_content(self):
        """Retrieve ``content`` from the page object."""
        wiki_page = self.page
        return wiki_page.content

    @property
    def _output_filename(self):
        """Path of the JSON output for the text rendering."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-txt.json"
146
+
147
+
148
@dataclass
class WikipediaIngestSummaryDoc(WikipediaIngestDoc):
    """Ingest document holding the ``summary`` attribute of the page."""

    registry_name: str = "wikipedia_summary"

    @property
    def filename(self) -> Path:
        """Resolved path of the downloaded ``-summary.txt`` file."""
        download_root = Path(self.read_config.download_dir)
        return (download_root / f"{self.get_filename_prefix()}-summary.txt").resolve()

    @property
    def text(self):
        """Summary of the page."""
        return self._get_summary()

    @SourceConnectionNetworkError.wrap
    def _get_summary(self):
        """Retrieve ``summary`` from the page object."""
        wiki_page = self.page
        return wiki_page.summary

    @property
    def _output_filename(self):
        """Path of the JSON output for the summary rendering."""
        output_root = Path(self.processor_config.output_dir)
        return output_root / f"{self.get_filename_prefix()}-summary.json"
169
+
170
+
171
@dataclass
class WikipediaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
    """Source connector producing text, HTML and summary docs for one page."""

    connector_config: SimpleWikipediaConfig

    def initialize(self):
        """Nothing to set up for this connector."""
        pass

    @requires_dependencies(["wikipedia"], extras="wikipedia")
    def check_connection(self):
        """Validate the configuration by looking the page up once.

        Raises:
            SourceConnectionError: if the lookup fails for any reason; the
                original exception is attached as the cause.
        """
        import wikipedia

        try:
            wikipedia.page(
                self.connector_config.page_title,
                auto_suggest=self.connector_config.auto_suggest,
            )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def get_ingest_docs(self):
        """Return one ingest doc per rendering, in text, HTML, summary order."""
        doc_classes = (
            WikipediaIngestTextDoc,
            WikipediaIngestHTMLDoc,
            WikipediaIngestSummaryDoc,
        )
        return [
            doc_cls(
                processor_config=self.processor_config,
                connector_config=self.connector_config,
                read_config=self.read_config,
            )
            for doc_cls in doc_classes
        ]
File without changes
@@ -0,0 +1,31 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field
5
+
6
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+
9
+ if TYPE_CHECKING:
10
+ from openai import AzureOpenAI
11
+
12
+
13
# Embedding settings for an OpenAI model reached through an Azure endpoint.
class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):
    api_version: str = Field(default="2024-06-01", description="Azure API version")
    # Endpoint forwarded to the ``AzureOpenAI`` client.
    azure_endpoint: str
    # Also populated through the ``model_name`` alias.
    embedder_model_name: str = Field(alias="model_name", default="text-embedding-ada-002")

    @requires_dependencies(["openai"], extras="openai")
    def get_client(self) -> "AzureOpenAI":
        # Build a new AzureOpenAI client from this configuration.
        from openai import AzureOpenAI

        secret_key = self.api_key.get_secret_value()
        return AzureOpenAI(
            api_key=secret_key,
            api_version=self.api_version,
            azure_endpoint=self.azure_endpoint,
        )
27
+
28
+
29
@dataclass
class AzureOpenAIEmbeddingEncoder(OpenAIEmbeddingEncoder):
    """OpenAI embedding encoder whose config is the Azure-specific one."""

    # Narrows the config type to the Azure variant.
    config: AzureOpenAIEmbeddingConfig
@@ -0,0 +1,193 @@
1
+ import asyncio
2
+ import json
3
+ import os
4
+ from contextlib import asynccontextmanager
5
+ from dataclasses import dataclass
6
+ from typing import TYPE_CHECKING, AsyncIterable
7
+
8
+ from pydantic import Field, SecretStr
9
+
10
+ from unstructured_ingest.embed.interfaces import (
11
+ AsyncBaseEmbeddingEncoder,
12
+ BaseEmbeddingEncoder,
13
+ EmbeddingConfig,
14
+ )
15
+ from unstructured_ingest.logger import logger
16
+ from unstructured_ingest.utils.dep_check import requires_dependencies
17
+ from unstructured_ingest.v2.errors import ProviderError, RateLimitError, UserAuthError, UserError
18
+
19
if TYPE_CHECKING:
    # Typing-only stubs: they describe the ``invoke_model`` call used by the
    # encoders and are never defined at runtime.
    from botocore.client import BaseClient

    class BedrockClient(BaseClient):
        def invoke_model(self, body: str, modelId: str, accept: str, contentType: str) -> dict:
            ...

    class AsyncBedrockClient(BaseClient):
        async def invoke_model(
            self, body: str, modelId: str, accept: str, contentType: str
        ) -> dict:
            ...
31
+
32
+
33
def conform_query(query: str, provider: str) -> dict:
    """Build the provider-specific request body for a Bedrock embedding call.

    Args:
        query: Text to embed. Newlines are replaced by single spaces.
        provider: Model provider prefix, e.g. ``"cohere"`` or ``"amazon"``.

    Returns:
        ``{"input_type": "search_document", "texts": [text]}`` for the
        ``"cohere"`` provider, ``{"inputText": text}`` for any other.
    """
    # replace newlines, which can negatively affect performance.
    # Every newline flavour is handled explicitly so the result does not
    # depend on the platform's line separator (``\r\n`` first, so a Windows
    # line ending becomes one space rather than two).
    text = query.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    # format input body for provider
    if provider == "cohere":
        return {"input_type": "search_document", "texts": [text]}
    # includes common provider == "amazon"
    return {"inputText": text}
47
+
48
+
49
# Credentials, region and model selection for Bedrock embeddings, plus the
# factories for the synchronous and asynchronous runtime clients.
class BedrockEmbeddingConfig(EmbeddingConfig):
    aws_access_key_id: SecretStr
    aws_secret_access_key: SecretStr
    region_name: str = "us-west-2"
    # Also populated through the ``model_name`` alias.
    embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")

    def wrap_error(self, e: Exception) -> Exception:
        # Translate a botocore ClientError into one of the ingest error types.
        #
        # 4xx responses map to RateLimitError (HTTP 429 or a
        # "ThrottlingException" code), UserAuthError (HTTP 403 or an
        # access-denied code) or UserError (everything else); 5xx responses
        # map to ProviderError. Anything else is logged and returned as-is.
        from botocore.exceptions import ClientError

        if isinstance(e, ClientError):
            # https://docs.aws.amazon.com/awssupport/latest/APIReference/CommonErrors.html
            http_response = e.response
            meta = http_response["ResponseMetadata"]
            http_response_code = meta["HTTPStatusCode"]
            error = http_response["Error"]
            error_code = error["Code"]
            if 400 <= http_response_code < 500:
                # Throttling is reported with HTTP 429, so it must not be
                # checked only under status 400 (it would otherwise degrade
                # to a generic UserError).
                if http_response_code == 429 or error_code == "ThrottlingException":
                    return RateLimitError(error)
                if http_response_code == 403 or error_code in (
                    "NotAuthorized",
                    "AccessDeniedException",
                ):
                    return UserAuthError(error)
                return UserError(error)
            if http_response_code >= 500:
                return ProviderError(error)

        logger.error(f"unhandled exception from bedrock: {e}", exc_info=True)
        return e

    @requires_dependencies(
        ["boto3", "numpy", "botocore"],
        extras="bedrock",
    )
    def get_client(self) -> "BedrockClient":
        # Create a new synchronous "bedrock-runtime" client.
        import boto3

        bedrock_client = boto3.client(
            service_name="bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id.get_secret_value(),
            aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
            region_name=self.region_name,
        )

        return bedrock_client

    @requires_dependencies(
        ["aioboto3"],
        extras="bedrock",
    )
    @asynccontextmanager
    async def get_async_client(self) -> AsyncIterable["AsyncBedrockClient"]:
        # Async context manager yielding a "bedrock-runtime" client that is
        # closed again when the ``async with`` block exits.
        import aioboto3

        session = aioboto3.Session()
        async with session.client(
            "bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id.get_secret_value(),
            aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
            region_name=self.region_name,
        ) as aws_bedrock:
            yield aws_bedrock
113
+
114
+
115
@dataclass
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Synchronous embedding encoder that calls Bedrock's ``invoke_model``."""

    config: BedrockEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    def embed_query(self, query: str) -> list[float]:
        """Call out to Bedrock embedding endpoint."""
        provider = self.config.embed_model_name.split(".")[0]
        request_body = conform_query(query=query, provider=provider)

        # NOTE(review): a new client is created for every query.
        runtime_client = self.config.get_client()
        # invoke bedrock API
        try:
            response = runtime_client.invoke_model(
                body=json.dumps(request_body),
                modelId=self.config.embed_model_name,
                accept="application/json",
                contentType="application/json",
            )
        except Exception as e:
            raise self.wrap_error(e=e)

        # format output based on provider
        parsed = json.loads(response.get("body").read())
        if provider == "cohere":
            return parsed.get("embeddings")[0]
        # includes common provider == "amazon"
        return parsed.get("embedding")

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed each element's ``text`` (empty string if absent), one call per element."""
        vectors = []
        for element in elements:
            vectors.append(self.embed_query(query=element.get("text", "")))
        return self._add_embeddings_to_elements(elements, vectors)
151
+
152
+
153
@dataclass
class AsyncBedrockEmbeddingEncoder(AsyncBaseEmbeddingEncoder):
    """Asynchronous embedding encoder that calls Bedrock's ``invoke_model``."""

    config: BedrockEmbeddingConfig

    def wrap_error(self, e: Exception) -> Exception:
        """Delegate error translation to the config."""
        return self.config.wrap_error(e=e)

    async def embed_query(self, query: str) -> list[float]:
        """Call out to Bedrock embedding endpoint.

        Raises:
            ValueError: for any failure. The underlying exception, including
                the typed error produced by ``wrap_error``, is attached as
                ``__cause__``.
        """
        provider = self.config.embed_model_name.split(".")[0]
        body = conform_query(query=query, provider=provider)
        try:
            async with self.config.get_async_client() as bedrock_client:
                # invoke bedrock API
                try:
                    response = await bedrock_client.invoke_model(
                        body=json.dumps(body),
                        modelId=self.config.embed_model_name,
                        accept="application/json",
                        contentType="application/json",
                    )
                except Exception as e:
                    raise self.wrap_error(e=e)
                async with response.get("body") as client_response:
                    response_body = await client_response.json()

                # format output based on provider
                if provider == "cohere":
                    return response_body.get("embeddings")[0]
                else:
                    # includes common provider == "amazon"
                    return response_body.get("embedding")
        except Exception as e:
            # NOTE(review): this also catches the typed error raised just above
            # and turns it into ValueError. The exception type is kept for
            # callers that rely on it; chaining makes the typed cause
            # available instead of discarding it.
            raise ValueError(f"Error raised by inference endpoint: {e}") from e

    async def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed each element's ``text`` (empty string if absent) concurrently."""
        embeddings = await asyncio.gather(
            *[self.embed_query(query=e.get("text", "")) for e in elements]
        )
        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
        return elements_with_embeddings
@@ -0,0 +1,52 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ from pydantic import Field
5
+
6
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+
9
+ if TYPE_CHECKING:
10
+ from sentence_transformers import SentenceTransformer
11
+
12
+
13
# Settings for embedding with a sentence-transformers model.
class HuggingFaceEmbeddingConfig(EmbeddingConfig):
    # Also populated through the ``model_name`` alias.
    embedder_model_name: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
    )
    # Extra keyword arguments for the ``SentenceTransformer`` constructor;
    # also populated through the ``model_kwargs`` alias.
    embedder_model_kwargs: Optional[dict] = Field(
        default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
    )
    # Extra keyword arguments for ``SentenceTransformer.encode``.
    encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
    cache_folder: Optional[str] = Field(default=None)

    @requires_dependencies(
        ["sentence_transformers"],
        extras="embed-huggingface",
    )
    def get_client(self) -> "SentenceTransformer":
        # Build a new SentenceTransformer from this configuration.
        from sentence_transformers import SentenceTransformer

        # The field is Optional, so an explicit None must not reach ``**``
        # (unpacking None raises TypeError).
        model_kwargs = self.embedder_model_kwargs or {}
        return SentenceTransformer(
            model_name_or_path=self.embedder_model_name,
            cache_folder=self.cache_folder,
            **model_kwargs,
        )
35
+
36
+
37
@dataclass
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder backed by a sentence-transformers model."""

    config: HuggingFaceEmbeddingConfig

    def embed_query(self, query: str) -> list[float]:
        """Embed a single string and return its vector."""
        return self._embed_documents(texts=[query])[0]

    def _embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` with a client obtained from the config.

        ``encode_kwargs`` is declared Optional on the config, so ``None`` is
        treated as "no extra arguments" instead of failing on ``**None``.
        """
        client = self.config.get_client()
        encode_kwargs = self.config.encode_kwargs or {}
        embeddings = client.encode(texts, **encode_kwargs)
        return embeddings.tolist()

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed each element's ``text`` (empty string if absent) in one encode call."""
        embeddings = self._embed_documents([e.get("text", "") for e in elements])
        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
        return elements_with_embeddings