unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,75 @@
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from test.integration.utils import requires_env
8
+ from unstructured_ingest.v2.errors import UserError
9
+ from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
10
+
11
# The sample documents live in an "assets" folder next to this module.
int_test_dir = Path(__file__).parent
assets_dir = int_test_dir / "assets"

# Suffixes treated as images; compared exactly (case-sensitive) against Path.suffix.
_image_suffixes = (".jpg", ".png", ".tif")

# Path.iterdir() yields entries in arbitrary order, so sort to keep the parametrized
# test order (and ids) stable from run to run.
all_partition_files = sorted(path for path in assets_dir.iterdir() if path.is_file())
non_image_partition_files = [
    path for path in all_partition_files if path.suffix not in _image_suffixes
]
# Check the suffix directly rather than scanning the non-image list for every file.
image_partition_files = [
    path for path in all_partition_files if path.suffix in _image_suffixes
]
21
+
22
+
23
@pytest.mark.parametrize(
    "partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_hi_res(partition_file: Path):
    """Partition every asset through the API with ``hi_res`` and save the output as JSON."""
    config = PartitionerConfig(
        strategy="hi_res",
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
    )
    elements = await Partitioner(config=config).run_async(filename=partition_file)

    # Keep a copy of the response under ./results, named after the input file.
    output_dir = int_test_dir / "results"
    output_dir.mkdir(exist_ok=True)
    with (output_dir / f"{partition_file.name}.json").open("w") as out:
        json.dump(elements, out, indent=2)
    assert elements
42
+
43
+
44
@pytest.mark.parametrize(
    "partition_file",
    non_image_partition_files,
    ids=[path.name for path in non_image_partition_files],
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast(partition_file: Path):
    """Partition every non-image asset through the API with the ``fast`` strategy."""
    config = PartitionerConfig(
        strategy="fast",
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
    )
    elements = await Partitioner(config=config).run_async(filename=partition_file)
    assert elements
60
+
61
+
62
@pytest.mark.parametrize(
    "partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
)
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
@pytest.mark.asyncio
async def test_partitioner_api_fast_error(partition_file: Path):
    """The ``fast`` strategy on an image asset is expected to raise ``UserError``."""
    config = PartitionerConfig(
        strategy="fast",
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
    )
    image_partitioner = Partitioner(config=config)
    # Only the partition call itself is expected to fail, not the setup above.
    with pytest.raises(UserError):
        await image_partitioner.run_async(filename=partition_file)
@@ -0,0 +1,15 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+
6
def requires_env(*envs):
    """Build a ``skipif`` marker that skips a test unless every named variable is set.

    Args:
        *envs: Names of the environment variables the test depends on.

    Returns:
        A ``pytest.mark.skipif`` marker. The condition is computed once, when this
        function is called (i.e. at decoration time).
    """
    # A variable that exists but is empty is as unusable as a missing one, so check the
    # value rather than mere presence in os.environ.
    missing = any(not os.environ.get(env) for env in envs)
    if len(envs) == 1:
        reason = f"Environment variable not set: {envs[0]}"
    else:
        reason = "All required environment variables not set: {}".format(", ".join(envs))
    return pytest.mark.skipif(missing, reason=reason)
test/unit/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,42 @@
1
+ from unstructured_ingest.embed.mixedbreadai import (
2
+ MixedbreadAIEmbeddingConfig,
3
+ MixedbreadAIEmbeddingEncoder,
4
+ )
5
+
6
+
7
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding via a mocked Mixedbread client keeps the text and adds embeddings."""

    def fake_embeddings(
        model,
        normalized,
        encoding_format,
        truncation_strategy,
        request_options,
        input,
    ):
        # One fake two-value vector per input text.
        response = mocker.MagicMock()
        response.data = [
            mocker.MagicMock(embedding=[idx, idx + 1]) for idx, _ in enumerate(input)
        ]
        return response

    client = mocker.MagicMock()
    client.embeddings.side_effect = fake_embeddings

    # Hand the encoder the mocked client and stub out the request options.
    mocker.patch.object(MixedbreadAIEmbeddingConfig, "get_client", return_value=client)
    mocker.patch.object(MixedbreadAIEmbeddingEncoder, "get_request_options", return_value={})

    config = MixedbreadAIEmbeddingConfig(
        api_key="api_key", model_name="mixedbread-ai/mxbai-embed-large-v1"
    )
    encoder = MixedbreadAIEmbeddingEncoder(config=config)

    embedded = encoder.embed_documents(
        elements=[{"text": f"This is sentence {n + 1}"} for n in range(2)],
    )
    assert len(embedded) == 2
    first, second = embedded
    assert first["text"] == "This is sentence 1"
    assert second["text"] == "This is sentence 2"
    assert first["embeddings"] is not None
    assert second["embeddings"] is not None
@@ -0,0 +1,27 @@
1
+ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
2
+
3
+
4
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding via a mocked OctoAI client returns both elements with text intact."""
    # Response carrying two entries, each exposing a fixed embedding vector.
    response = mocker.MagicMock()
    response.data = [mocker.MagicMock(embedding=[1, 2]) for _ in range(2)]
    client = mocker.MagicMock()
    client.embeddings.create.return_value = response

    # Hand the encoder the mocked client instead of a real one.
    mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=client)

    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key"))
    embedded = encoder.embed_documents(
        elements=[{"text": f"This is sentence {n + 1}"} for n in range(2)],
    )
    assert len(embedded) == 2
    first, second = embedded
    assert first["text"] == "This is sentence 1"
    assert second["text"] == "This is sentence 2"
@@ -0,0 +1,20 @@
1
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
2
+
3
+
4
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding via a mocked OpenAI client returns both elements with text intact."""
    client = mocker.MagicMock()
    client.embed_documents.return_value = [1, 2]

    # Hand the encoder the mocked client instead of a real one.
    mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=client)

    encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key"))
    embedded = encoder.embed_documents(
        elements=[{"text": f"This is sentence {n + 1}"} for n in range(2)],
    )
    assert len(embedded) == 2
    first, second = embedded
    assert first["text"] == "This is sentence 1"
    assert second["text"] == "This is sentence 2"
@@ -0,0 +1,25 @@
1
+ from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
2
+
3
+
4
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding via a mocked VertexAI client returns both elements with text intact."""
    # Two fake responses whose ``values`` attribute is 1 and 2 respectively.
    responses = []
    for value in (1, 2):
        response = mocker.Mock()
        mocker.patch.object(response, "values", value)
        responses.append(response)

    client = mocker.MagicMock()
    client.get_embeddings.return_value = responses

    # Patch get_client so the encoder receives the mocked client.
    mocker.patch.object(VertexAIEmbeddingConfig, "get_client", return_value=client)
    config = VertexAIEmbeddingConfig(api_key={"api_key": "value"})
    encoder = VertexAIEmbeddingEncoder(config=config)

    embedded = encoder.embed_documents(
        elements=[{"text": f"This is sentence {n + 1}"} for n in range(2)],
    )
    assert len(embedded) == 2
    first, second = embedded
    assert first["text"] == "This is sentence 1"
    assert second["text"] == "This is sentence 2"
@@ -0,0 +1,24 @@
1
+ from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
2
+
3
+
4
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding via a mocked VoyageAI client returns both elements with text intact."""
    # The mocked ``embed`` call returns an object exposing ``embeddings``.
    response = mocker.MagicMock()
    mocker.patch.object(response, "embeddings", [1, 2])
    client = mocker.MagicMock()
    client.embed.return_value = response

    # Hand the encoder the mocked client instead of a real one.
    mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=client)

    config = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
    encoder = VoyageAIEmbeddingEncoder(config=config)

    embedded = encoder.embed_documents(
        elements=[{"text": f"This is sentence {n + 1}"} for n in range(2)],
    )
    assert len(embedded) == 2
    first, second = embedded
    assert first["text"] == "This is sentence 1"
    assert second["text"] == "This is sentence 2"
@@ -0,0 +1,27 @@
1
+ import pytest
2
+
3
+ from unstructured_ingest.error import (
4
+ DestinationConnectionError,
5
+ PartitionError,
6
+ SourceConnectionError,
7
+ )
8
+
9
+
10
@pytest.mark.parametrize(
    ("error_class", "exception_type", "error_message"),
    [
        (SourceConnectionError, ValueError, "Simulated connection error"),
        (DestinationConnectionError, RuntimeError, "Simulated connection error"),
        (PartitionError, FileNotFoundError, "Simulated partition error"),
    ],
)
def test_custom_error_decorator(error_class, exception_type, error_message):
    """``error_class.wrap`` re-raises the inner exception as ``error_class``."""

    @error_class.wrap
    def raise_inner():
        raise exception_type(error_message)

    with pytest.raises(error_class) as exc_info:
        raise_inner()

    # The wrapped message is the class template filled with the original message.
    assert str(exc_info.value) == error_class.error_string.format(error_message)
@@ -0,0 +1,78 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from unstructured_ingest.logger import (
6
+ default_is_data_sensitive,
7
+ hide_sensitive_fields,
8
+ redact_jsons,
9
+ )
10
+
11
+
12
@pytest.mark.parametrize(
    ("key", "value", "is_sensitive"),
    [
        ("username", "john_smith", False),
        ("password", "13?H%", True),
        ("token", "123", True),
        ("AWS_CREDENTIAL", "aws_credential", True),
        ("AWS_KEY", None, False),
    ],
)
def test_default_is_sensitive(key, value, is_sensitive):
    """``default_is_data_sensitive`` gives the expected verdict for each key/value pair."""
    verdict = default_is_data_sensitive(key, value)
    assert verdict == is_sensitive
24
+
25
+
26
def test_hide_sensitive_fields():
    """Sensitive values are masked at every level, including inside an embedded JSON string."""
    raw_inner_json = json.dumps(
        {"account_name": "secret name", "client_id": 123, "timestamp": 123}
    )
    payload = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {"token": "123", "AWS_KEY": None, "inner_j_string": raw_inner_json},
    }

    redacted = hide_sensitive_fields(payload)

    masked_inner_json = json.dumps(
        {"account_name": "*******", "client_id": "*******", "timestamp": 123}
    )
    expected = {
        "password": "*******",
        "username": "john_smith",
        "inner": {"token": "*******", "AWS_KEY": None, "inner_j_string": masked_inner_json},
    }
    assert redacted == expected
51
+
52
+
53
def test_redact_jsons():
    """JSON and dict fragments embedded in free text are redacted in place."""
    inner_json = json.dumps({"account_name": "secret name", "client_id": 123, "timestamp": 123})
    nested = {
        "username": "john_smith",
        "password": "13?H%",
        "inner": {"token": "123", "AWS_KEY": None, "inner_j_string": inner_json},
    }
    user_info = {"username": "tim67", "update_time": 456}
    account_info = {"account_name": "top secret", "host": "http://localhost:8888"}

    # Only the first dict is serialized as JSON; the other two are interpolated via str().
    message = f"Some topic secret info ({json.dumps(nested)} regarding {user_info} and {account_info})"
    expected = (
        'Some topic secret info ({"username": "john_smith", "password": "*******", '
        '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
        '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
        '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
        'and {"account_name": "*******", "host": "http://localhost:8888"})'
    )
    assert redact_jsons(message) == expected
@@ -0,0 +1,184 @@
1
+ import json
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+
6
+ import pytest
7
+ import pytz
8
+
9
+ from unstructured_ingest.cli.utils import extract_config
10
+ from unstructured_ingest.interfaces import BaseConfig
11
+ from unstructured_ingest.utils.string_and_date_utils import (
12
+ ensure_isoformat_datetime,
13
+ json_to_dict,
14
+ truncate_string_bytes,
15
+ )
16
+
17
+
18
@dataclass
class A(BaseConfig):
    """Innermost config fixture: a single string field, nested inside ``B``."""

    a: str
21
+
22
+
23
@dataclass
class B(BaseConfig):
    """Middle config fixture: wraps an ``A`` instance alongside an integer field."""

    a: A
    b: int
27
+
28
+
29
# Flat key/value input shared by the extract_config tests below.
flat_data = {"a": "test", "b": 4, "c": True}
30
+
31
+
32
def test_extract_config_concrete():
    """Flat data is nested into required dataclass-typed fields."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: bool

    extracted = extract_config(flat_data=flat_data, config=C)
    expected_json = json.dumps({"b": {"a": {"a": "test"}, "b": 4}, "c": True}, sort_keys=True)
    assert extracted.to_json(sort_keys=True) == expected_json
41
+
42
+
43
def test_extract_config_optional():
    """An ``Optional`` dataclass field is still populated from the flat data."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[B] = None

    extracted = extract_config(flat_data=flat_data, config=C)
    expected_json = json.dumps({"b": {"a": {"a": "test"}, "b": 4}, "c": True}, sort_keys=True)
    assert extracted.to_json(sort_keys=True) == expected_json
52
+
53
+
54
def test_extract_config_union():
    """A ``Union[B, int]`` field receives the raw integer rather than a nested ``B``."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[t.Union[B, int]] = None

    extracted = extract_config(flat_data=flat_data, config=C)
    expected_json = json.dumps({"b": 4, "c": True}, sort_keys=True)
    assert extracted.to_json(sort_keys=True) == expected_json
63
+
64
+
65
def test_extract_config_list():
    """A required ``List[int]`` field is passed through next to a nested dataclass."""

    @dataclass
    class C(BaseConfig):
        c: t.List[int]
        b: B

    # Local input: same as the shared fixture, but "c" holds a list.
    data = {"a": "test", "b": 4, "c": [1, 2, 3]}
    extracted = extract_config(flat_data=data, config=C)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}, sort_keys=True
    )
    assert extracted.to_json(sort_keys=True) == expected_json
75
+
76
+
77
def test_extract_config_optional_list():
    """An ``Optional[List[int]]`` field is populated when the flat data provides it."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: t.Optional[t.List[int]] = None

    # Local input: same as the shared fixture, but "c" holds a list.
    data = {"a": "test", "b": 4, "c": [1, 2, 3]}
    extracted = extract_config(flat_data=data, config=C)
    expected_json = json.dumps(
        {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}, sort_keys=True
    )
    assert extracted.to_json(sort_keys=True) == expected_json
87
+
88
+
89
def test_extract_config_dataclass_list():
    """A ``List[B]`` field with a list default factory serializes as an empty list."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.List[B] = field(default_factory=list)

    data = {"a": "test", "c": True}
    extracted = extract_config(flat_data=data, config=C)
    expected_json = json.dumps({"b": [], "c": True}, sort_keys=True)
    assert extracted.to_json(sort_keys=True) == expected_json
99
+
100
+
101
def test_extract_config_dict():
    """A ``Dict[str, B]`` field with a dict default factory serializes as an empty dict."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Dict[str, B] = field(default_factory=dict)

    data = {"c": True}
    extracted = extract_config(flat_data=data, config=C)
    expected_json = json.dumps({"c": True, "b": {}}, sort_keys=True)
    assert extracted.to_json(sort_keys=True) == expected_json
111
+
112
+
113
def test_json_to_dict_valid_json():
    """Well-formed JSON text is parsed into a dict."""
    raw = '{"key": "value"}'
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
118
+
119
+
120
def test_json_to_dict_malformed_json():
    """Malformed JSON text comes back unchanged, still as a string."""
    raw = '{"key": "value"'
    assert json_to_dict(raw) == '{"key": "value"'
    assert isinstance(json_to_dict(raw), str)
125
+
126
+
127
def test_json_to_dict_single_quotes():
    """Single-quoted, dict-like text is also parsed into a dict."""
    raw = "{'key': 'value'}"
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
132
+
133
+
134
def test_json_to_dict_path():
    """A filesystem-path-like string comes back unchanged, still as a string."""
    raw = "/path/to/file.json"
    assert json_to_dict(raw) == "/path/to/file.json"
    assert isinstance(json_to_dict(raw), str)
139
+
140
+
141
def test_ensure_isoformat_datetime_for_datetime():
    """A naive datetime is rendered as an ISO string without an offset."""
    naive = datetime(2021, 1, 1, 12, 0, 0)
    assert ensure_isoformat_datetime(naive) == "2021-01-01T12:00:00"
144
+
145
+
146
def test_ensure_isoformat_datetime_for_datetime_with_tz():
    """A UTC-aware datetime is rendered with a ``+00:00`` offset."""
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC)
    assert ensure_isoformat_datetime(aware) == "2021-01-01T12:00:00+00:00"
149
+
150
+
151
def test_ensure_isoformat_datetime_for_string():
    """An ISO string without an offset comes back as the same text."""
    text = "2021-01-01T12:00:00"
    assert ensure_isoformat_datetime(text) == "2021-01-01T12:00:00"
154
+
155
+
156
def test_ensure_isoformat_datetime_for_string2():
    """An ISO string with a ``+00:00`` offset comes back as the same text."""
    text = "2021-01-01T12:00:00+00:00"
    assert ensure_isoformat_datetime(text) == "2021-01-01T12:00:00+00:00"
159
+
160
+
161
def test_ensure_isoformat_datetime_fails_on_string():
    """A string that is not a timestamp raises ``ValueError``."""
    unparseable = "bad timestamp"
    with pytest.raises(ValueError):
        ensure_isoformat_datetime(unparseable)
164
+
165
+
166
def test_ensure_isoformat_datetime_fails_on_int():
    """An integer argument raises ``TypeError``."""
    not_a_timestamp = 1111
    with pytest.raises(TypeError):
        ensure_isoformat_datetime(not_a_timestamp)
169
+
170
+
171
def test_truncate_string_bytes_return_truncated_string():
    """Truncation to 11 bytes keeps whole characters and stays within the byte limit."""
    limit = 11
    mixed_text = "abcdef안녕하세요ghijklmn방갑습니opqrstu 더 길어지면 안되는 문자열vwxyz"
    truncated = truncate_string_bytes(mixed_text, limit)
    assert truncated == "abcdef안"
    assert len(truncated.encode("utf-8")) <= limit
177
+
178
+
179
def test_truncate_string_bytes_return_untouched_string():
    """A string already within the byte limit comes back unchanged."""
    limit = 11
    short_text = "abcdef"
    untouched = truncate_string_bytes(short_text, limit)
    assert untouched == "abcdef"
    assert len(untouched.encode("utf-8")) <= limit
File without changes
File without changes
@@ -0,0 +1,49 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
7
+
8
# Shared Faker instance used to generate random config values in this module.
fake = faker.Faker()
9
+
10
+
11
def generate_chunker_config_params() -> dict:
    """Build a randomized dict of keyword values for ``ChunkerConfig``.

    Returns:
        A dict that always contains ``chunk_by_api``; the other keys depend on the
        random draws below.
    """
    params = {}
    # A single draw drives both ``if random_val < 0.5`` checks, so they always agree.
    random_val = random.random()
    # NOTE(review): the nesting here was reconstructed from a flattened diff; confirm
    # which of the following assignments belong to this branch in the real file.
    if random_val < 0.5:
        # Each optional value gets its own coin flip between a fake value and None.
        params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
        params["chunk_combine_text_under_n_chars"] = (
            fake.random_int() if random.random() < 0.5 else None
        )
        params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
        params["chunk_max_characters"] = fake.random_int()
        params["chunk_multipage_sections"] = fake.boolean()
        params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
        params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
        params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
    if random_val < 0.5:
        # API chunking is accompanied by an endpoint and a key.
        params["chunk_by_api"] = True
        params["chunking_endpoint"] = fake.url()
        params["chunk_api_key"] = fake.password()
    else:
        params["chunk_by_api"] = False

    return params
33
+
34
+
35
@pytest.mark.parametrize(
    "partition_config_params", [generate_chunker_config_params() for _ in range(10)]
)
def test_chunker_config(partition_config_params: dict):
    """Each randomly generated parameter dict validates into a ``ChunkerConfig``."""
    validated = ChunkerConfig.model_validate(partition_config_params)
    assert validated
41
+
42
+
43
@pytest.mark.parametrize(
    "partition_config_params", [generate_chunker_config_params() for _ in range(10)]
)
def test_chunker(partition_config_params: dict):
    """A ``Chunker`` can be built from each randomly generated, validated config."""
    validated = ChunkerConfig.model_validate(partition_config_params)
    assert Chunker(config=validated)
File without changes
@@ -0,0 +1,39 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+
4
+ from unstructured_ingest.v2.processes.connectors.confluence import (
5
+ ConfluenceAccessConfig,
6
+ ConfluenceConnectionConfig,
7
+ )
8
+
9
+
10
def test_connection_config_multiple_auth():
    """Supplying both an API token and an access token raises ``ValidationError``."""
    credentials = {"api_token": "api_token", "access_token": "access_token"}
    # Build everything inside the context so an error from either model is caught.
    with pytest.raises(ValidationError):
        ConfluenceConnectionConfig(
            access_config=ConfluenceAccessConfig(**credentials),
            user_email="user_email",
            url="url",
        )
20
+
21
+
22
def test_connection_config_no_auth():
    """Supplying no credentials at all raises ``ValidationError``."""
    # Build everything inside the context so an error from either model is caught.
    with pytest.raises(ValidationError):
        ConfluenceConnectionConfig(
            access_config=ConfluenceAccessConfig(),
            url="url",
        )
25
+
26
+
27
def test_connection_config_basic_auth():
    """An API token together with a user email is accepted without error."""
    token_access = ConfluenceAccessConfig(api_token="api_token")
    ConfluenceConnectionConfig(access_config=token_access, url="url", user_email="user_email")
33
+
34
+
35
def test_connection_config_pat_auth():
    """An access token on its own is accepted without error."""
    pat_access = ConfluenceAccessConfig(access_token="access_token")
    ConfluenceConnectionConfig(access_config=pat_access, url="url")
File without changes
@@ -0,0 +1,36 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
7
+
8
# Shared Faker instance used to generate random config values in this module.
fake = faker.Faker()
9
+
10
+
11
def generate_embedder_config_params() -> dict:
    """Build a randomized dict of keyword values for ``BedrockEmbeddingConfig``.

    Returns:
        A dict with fake AWS credentials and region, plus ``embed_model_name`` in
        roughly half of the calls.
    """
    config_kwargs = dict(
        aws_access_key_id=fake.password(),
        aws_secret_access_key=fake.password(),
        region_name=fake.city(),
    )
    include_model_name = random.random() < 0.5
    if include_model_name:
        config_kwargs["embed_model_name"] = fake.word()
    return config_kwargs
20
+
21
+
22
@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder_config(embedder_config_params: dict):
    """Each randomly generated parameter dict validates into a ``BedrockEmbeddingConfig``."""
    validated = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
28
+
29
+
30
@pytest.mark.parametrize(
    "embedder_config_params", [generate_embedder_config_params() for _ in range(10)]
)
def test_embedder(embedder_config_params: dict):
    """A ``BedrockEmbeddingEncoder`` can be built from each validated config."""
    validated = BedrockEmbeddingConfig.model_validate(embedder_config_params)
    assert BedrockEmbeddingEncoder(config=validated)