unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,48 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ import faker
5
+ import pytest
6
+
7
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
8
+ from unstructured_ingest.embed.huggingface import (
9
+ HuggingFaceEmbeddingConfig,
10
+ HuggingFaceEmbeddingEncoder,
11
+ )
12
+
13
+ fake = faker.Faker()
14
+
15
+
16
def generate_embedder_config_params() -> dict:
    """Build a random kwargs dict for ``HuggingFaceEmbeddingConfig``.

    Half of the time the dict is empty; otherwise each of the four optional
    settings is populated with either a random value or ``None``.

    Returns:
        dict: keyword parameters to feed to ``model_validate``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff;
    # all four assignments are assumed to live under the ``if`` — confirm.
    params = {}
    if random.random() < 0.5:
        # Key spelled ``embedder_model_name`` to match the sibling
        # ``embedder_model_kwargs`` key and the other embedder test modules
        # (the previous ``embed_model_name`` spelling matched nothing).
        params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
        params["embedder_model_kwargs"] = (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        )
        params["encode_kwargs"] = (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        )
        params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
    return params
32
+
33
+
34
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
40
+
41
+
42
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = HuggingFaceEmbeddingEncoder(
        config=HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,37 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.mixedbreadai import (
7
+ MixedbreadAIEmbeddingConfig,
8
+ MixedbreadAIEmbeddingEncoder,
9
+ )
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
def generate_embedder_config_params() -> dict:
    """Return random config kwargs: always an API key, sometimes a model name."""
    api_key = fake.password()
    if random.random() < 0.5:
        return {"api_key": api_key, "embedder_model_name": fake.word()}
    return {"api_key": api_key}
21
+
22
+
23
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
29
+
30
+
31
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = MixedbreadAIEmbeddingEncoder(
        config=MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,35 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
def generate_embedder_config_params() -> dict:
    """Return random config kwargs: always an API key, optionally model and URL."""
    params = {"api_key": fake.password()}
    # NOTE(review): nesting reconstructed from a flattened diff; both optional
    # keys are assumed to be set together under the same coin flip — confirm.
    include_optional = random.random() < 0.5
    if include_optional:
        params.update(embedder_model_name=fake.word(), base_url=fake.url())
    return params
19
+
20
+
21
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
27
+
28
+
29
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = OctoAIEmbeddingEncoder(
        config=OctoAiEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,35 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
def generate_embedder_config_params() -> dict:
    """Return random config kwargs: always an API key, optionally model and URL."""
    params = {"api_key": fake.password()}
    # NOTE(review): nesting reconstructed from a flattened diff; both optional
    # keys are assumed to be set together under the same coin flip — confirm.
    include_optional = random.random() < 0.5
    if include_optional:
        params.update(embedder_model_name=fake.word(), base_url=fake.url())
    return params
19
+
20
+
21
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
27
+
28
+
29
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = OpenAIEmbeddingEncoder(
        config=OpenAIEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,37 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.togetherai import (
7
+ TogetherAIEmbeddingConfig,
8
+ TogetherAIEmbeddingEncoder,
9
+ )
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
def generate_embedder_config_params() -> dict:
    """Return random config kwargs: always an API key, sometimes a model name."""
    api_key = fake.password()
    if random.random() < 0.5:
        return {"api_key": api_key, "embedder_model_name": fake.word()}
    return {"api_key": api_key}
21
+
22
+
23
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
29
+
30
+
31
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = TogetherAIEmbeddingEncoder(
        config=TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,37 @@
1
+ import json
2
+ import random
3
+ from typing import Any
4
+
5
+ import faker
6
+ import pytest
7
+
8
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
9
+ from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
def generate_embedder_config_params() -> dict:
    """Return random config kwargs whose API key is a JSON-encoded random dict."""
    credentials = generate_random_dictionary(key_type=str, value_type=Any)
    params = {"api_key": json.dumps(credentials)}
    # The model name is supplied only half of the time.
    if random.random() < 0.5:
        params["embedder_model_name"] = fake.word()
    return params
21
+
22
+
23
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
29
+
30
+
31
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = VertexAIEmbeddingEncoder(
        config=VertexAIEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
@@ -0,0 +1,38 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
def generate_embedder_config_params() -> dict:
    """Return random config kwargs: always an API key, optionally tuning knobs."""
    params = {"api_key": fake.password()}
    # NOTE(review): nesting reconstructed from a flattened diff; all optional
    # keys are assumed to be set together under the same coin flip — confirm.
    if not random.random() < 0.5:
        return params
    params.update(
        embedder_model_name=fake.word(),
        batch_size=fake.random_int(),
        truncation=fake.boolean(),
        max_retries=fake.random_int(),
        timeout_in_seconds=fake.random_int(),
    )
    return params
22
+
23
+
24
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder_config(embedder_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
    assert validated
30
+
31
+
32
@pytest.mark.parametrize(
    "embedder_config_params",
    [generate_embedder_config_params() for _ in range(10)],
)
def test_embedder(embedder_config_params: dict):
    """An encoder can be constructed from each randomly generated config."""
    encoder = VoyageAIEmbeddingEncoder(
        config=VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
    )
    assert encoder
File without changes
@@ -0,0 +1,63 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ import faker
5
+ import pytest
6
+
7
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
8
+ from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
9
+
10
+ fake = faker.Faker()
11
+
12
+
13
def generate_partitioner_config_params() -> dict:
    """Build a random kwargs dict for ``PartitionerConfig``.

    Optional settings are randomly present or ``None``; at most one of
    ``metadata_exclude`` / ``metadata_include`` is ever set, and
    ``partition_by_api`` is always set explicitly.

    Returns:
        dict: keyword parameters to feed to ``model_validate``.
    """
    params = {
        "strategy": random.choice(["fast", "hi_res", "auto"]),
        "ocr_languages": fake.words() if random.random() < 0.5 else None,
        "encoding": fake.word() if random.random() < 0.5 else None,
        "additional_partition_args": (
            generate_random_dictionary(key_type=str, value_type=Any)
            if random.random() < 0.5
            else None
        ),
        "skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
        "flatten_metadata": fake.boolean(),
        "hi_res_model_name": fake.word() if random.random() < 0.5 else None,
    }

    # Randomly set the fields_include to a random list[str]
    if random.random() < 0.5:
        params["fields_include"] = fake.words()

    # Randomly set metadata_exclude or metadata_include to a list[str], or
    # neither. A single draw drives the three-way split so each outcome has
    # probability 1/3 and is independent of the fields_include decision.
    metadata_choice = random.random()
    if metadata_choice < (1 / 3):
        params["metadata_exclude"] = fake.words()
    elif metadata_choice < (2 / 3):
        params["metadata_include"] = fake.words()

    # Randomly set the values associated with calling the api, or not at all.
    # These must be real assignments: ``params["k"]: value`` is only an
    # annotation statement and leaves the key unset.
    if random.random() < 0.5:
        params["partition_by_api"] = True
        params["partition_endpoint"] = fake.url()
        params["api_key"] = fake.password()
    else:
        params["partition_by_api"] = False
    return params
47
+
48
+
49
@pytest.mark.parametrize(
    "partition_config_params",
    [generate_partitioner_config_params() for _ in range(10)],
)
def test_partition_config(partition_config_params: dict):
    """Ten random parameter sets must each validate into a truthy config."""
    validated = PartitionerConfig.model_validate(partition_config_params)
    assert validated
55
+
56
+
57
@pytest.mark.parametrize(
    "partition_config_params",
    [generate_partitioner_config_params() for _ in range(10)],
)
def test_partitioner(partition_config_params: dict):
    """A partitioner can be constructed from each randomly generated config."""
    built = Partitioner(config=PartitionerConfig.model_validate(partition_config_params))
    assert built
@@ -0,0 +1,26 @@
1
+ import pytest
2
+ from pydantic import Secret, ValidationError
3
+
4
+ from unstructured_ingest.v2.interfaces import AccessConfig, ConnectionConfig
5
+
6
+
7
def test_failing_connection_config():
    """Expect a ValidationError when ``access_config`` is annotated with the bare type."""

    class MyAccessConfig(AccessConfig):
        sensitive_value: str

    class MyConnectionConfig(ConnectionConfig):
        # Annotated with the plain access config type (no ``Secret[...]`` wrapper).
        access_config: MyAccessConfig

    with pytest.raises(ValidationError):
        MyConnectionConfig(
            access_config=MyAccessConfig(sensitive_value="this"),
        )
16
+
17
+
18
def test_happy_path_connection_config():
    """A connection config whose ``access_config`` is ``Secret[...]`` builds fine."""

    class MyAccessConfig(AccessConfig):
        sensitive_value: str

    class MyConnectionConfig(ConnectionConfig):
        # Wrapped in ``Secret[...]``, unlike the failing variant of this test.
        access_config: Secret[MyAccessConfig]

    access = MyAccessConfig(sensitive_value="this")
    assert MyConnectionConfig(access_config=access)
@@ -0,0 +1,82 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field, Secret, SecretStr
5
+ from pydantic.types import _SecretBase
6
+
7
+ from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
8
+
9
+
10
class MockChildBaseModel(BaseModel):
    """Nested fixture model mixing secret and plain fields."""

    # Secret-wrapped values.
    child_secret_str: SecretStr
    child_secret_float: Secret[float]
    # Plain value; defaults to a fresh empty dict per instance.
    child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
14
+
15
+
16
class MockBaseModel(BaseModel):
    """Top-level fixture model with secret fields, including a secret sub-model."""

    secret_str: SecretStr
    not_secret_bool: bool
    # An entire child model held behind a ``Secret`` wrapper.
    secret_child_base: Secret[MockChildBaseModel]
    # Plain value; defaults to a fresh empty list per instance.
    not_secret_list: list[int] = Field(default_factory=list)
21
+
22
+
23
# Module-level fixture instance used by the serialization tests in this file.
model = MockBaseModel(
    secret_str="secret string",
    not_secret_bool=False,
    not_secret_list=[1, 2, 3],
    secret_child_base=MockChildBaseModel(
        child_not_secret_dict={"key": "value"},
        child_secret_float=3.14,
        child_secret_str="child secret string",
    ),
)
33
+
34
+
35
def test_serialize_base_model():
    """``serialize_base_model`` must unwrap secrets that ``model_dump`` keeps wrapped."""
    secret_fields = ("secret_str", "secret_child_base")

    # Plain pydantic dump: secret fields are still secret wrapper objects.
    masked = model.model_dump()
    for name in secret_fields:
        assert isinstance(masked[name], _SecretBase)

    # Project helper: secret fields come back as their underlying values.
    revealed = serialize_base_model(model=model)
    for name in secret_fields:
        assert not isinstance(revealed[name], _SecretBase)

    assert revealed == {
        "secret_str": "secret string",
        "not_secret_bool": False,
        "secret_child_base": {
            "child_secret_str": "child secret string",
            "child_secret_float": 3.14,
            "child_not_secret_dict": {"key": "value"},
        },
        "not_secret_list": [1, 2, 3],
    }
57
+
58
+
59
def test_serialize_base_model_json():
    """``serialize_base_model_json`` must emit real values where pydantic masks them."""
    # Plain pydantic JSON dump: secrets are rendered as asterisks.
    masked = json.loads(model.model_dump_json())
    assert {
        "secret_str": "**********",
        "not_secret_bool": False,
        "secret_child_base": "**********",
        "not_secret_list": [1, 2, 3],
    } == masked

    # Project helper: secrets are rendered as their underlying values.
    revealed = json.loads(serialize_base_model_json(model=model))
    assert {
        "secret_str": "secret string",
        "not_secret_bool": False,
        "secret_child_base": {
            "child_secret_str": "child secret string",
            "child_secret_float": 3.14,
            "child_not_secret_dict": {"key": "value"},
        },
        "not_secret_list": [1, 2, 3],
    } == revealed
File without changes
@@ -0,0 +1,32 @@
1
+ import random
2
+ from typing import Any, Type
3
+
4
+ from faker import Faker
5
+
6
+ fake = Faker()
7
+
8
# Zero-argument Faker factories used to produce a random value of each type.
type_to_random_value_map = {
    str: fake.sentence,
    int: fake.random_int,
    # ``pyfloat`` yields an actual float; the previous ``random_digit``
    # returned an int in 0-9, so "float" values were never floats.
    float: fake.pyfloat,
    bool: fake.boolean,
}
# Factories for dictionary keys: identical, except that string keys are
# single words rather than whole sentences.
type_to_random_value_map_key = type_to_random_value_map.copy()
type_to_random_value_map_key[str] = fake.word
16
+
17
+
18
def generate_random_dictionary(key_type: Type = str, value_type: Type = str) -> dict:
    """Create a dictionary with one to three random entries.

    Args:
        key_type: type of every key; must be present in
            ``type_to_random_value_map_key``.
        value_type: type of every value. ``Any`` picks a type at random per
            entry, including ``dict``, in which case a nested dictionary is
            generated recursively with the same arguments.

    Returns:
        dict: the randomly populated dictionary.
    """
    entry_count = random.randint(1, 3)
    result = {}
    for _ in range(entry_count):
        key = type_to_random_value_map_key[key_type]()
        chosen_type = value_type
        if chosen_type is Any:
            # ``dict`` is added to the candidates so nesting can occur.
            chosen_type = random.choice(list(type_to_random_value_map.keys()) + [dict])
        if chosen_type is dict:
            result[key] = generate_random_dictionary(key_type=key_type, value_type=value_type)
        else:
            result[key] = type_to_random_value_map[chosen_type]()
    return result
@@ -0,0 +1 @@
1
+ from __future__ import annotations
@@ -0,0 +1 @@
1
+ __version__ = "0.3.13" # pragma: no cover
@@ -0,0 +1,14 @@
1
+ import typing as t
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.cli.cmds import base_dest_cmd_fns, base_src_cmd_fns
6
+
7
# Instantiate each registered command factory once and collect the click
# objects it produces.
src: t.List[click.Group] = [cmd_fn().get_src_cmd() for cmd_fn in base_src_cmd_fns]

dest: t.List[click.Command] = [cmd_fn().get_dest_cmd() for cmd_fn in base_dest_cmd_fns]

__all__ = ["src", "dest"]
File without changes
@@ -0,0 +1,19 @@
1
+ import typing as t
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+
5
+ from unstructured_ingest.cli.interfaces import CliConfig
6
+ from unstructured_ingest.interfaces import BaseConfig
7
+
8
+
9
@dataclass
class BaseCmd(ABC):
    """Common settings shared by CLI command definitions."""

    # Command name as written on the command line (may contain hyphens).
    cmd_name: str
    cli_config: t.Optional[t.Type[BaseConfig]] = None
    additional_cli_options: t.List[t.Type[CliConfig]] = field(default_factory=list)
    addition_configs: t.Dict[str, t.Type[BaseConfig]] = field(default_factory=dict)
    is_fsspec: bool = False

    @property
    def cmd_name_key(self):
        """Return ``cmd_name`` with every hyphen replaced by an underscore."""
        return "_".join(self.cmd_name.split("-"))
@@ -0,0 +1,87 @@
1
+ import logging
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ import click
6
+
7
+ from unstructured_ingest.cli.base.cmd import BaseCmd
8
+ from unstructured_ingest.cli.cmd_factory import get_src_cmd
9
+ from unstructured_ingest.cli.common import (
10
+ log_options,
11
+ )
12
+ from unstructured_ingest.cli.interfaces import BaseConfig, CliFilesStorageConfig
13
+ from unstructured_ingest.cli.utils import (
14
+ add_options,
15
+ conform_click_options,
16
+ extract_config,
17
+ extract_configs,
18
+ )
19
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
20
+ from unstructured_ingest.runner.writers import writer_map
21
+
22
+
23
@dataclass
class BaseDestCmd(BaseCmd):
    """CLI destination command, invoked as a subcommand of a source command."""

    write_config: t.Optional[t.Type[BaseConfig]] = None

    def get_dest_runner(self, source_cmd: str, options: dict, parent_options: dict):
        """Build the source runner and attach this destination's writer to it.

        Args:
            source_cmd: key of the parent source command.
            options: options passed to this destination command.
            parent_options: options passed to the parent source command.

        Returns:
            The source runner with ``writer`` and ``writer_kwargs`` set.
        """
        runner = get_src_cmd(cmd_name=source_cmd)().get_source_runner(options=parent_options)

        # Same dict object as the instance attribute: updated in place.
        # NOTE(review): the two checks below are assumed to be siblings (not
        # nested); nesting was reconstructed from a flattened diff — confirm.
        extras = self.addition_configs
        if "connector_config" not in extras:
            extras["connector_config"] = self.cli_config
        if self.write_config:
            extras["write_config"] = self.write_config

        to_validate = [self.cli_config] if self.cli_config else None
        configs = extract_configs(
            options,
            validate=to_validate,
            extras=extras,
            add_defaults=False,
        )
        runner.writer = writer_map[self.cmd_name_key](**configs)  # type: ignore
        runner.writer_kwargs = options
        return runner

    def check_dest_options(self, options: dict):
        """Run the destination options through ``extract_config`` for ``cli_config``."""
        extract_config(flat_data=options, config=self.cli_config)

    def dest(self, ctx: click.Context, **options):
        """Click callback: check options, then build and run the pipeline.

        Raises:
            click.ClickException: when there is no parent command, the parent
                has no info name, or anything fails while checking options,
                building the runner or running it.
        """
        parent = ctx.parent
        if not parent:
            raise click.ClickException("destination command called without a parent")
        if not parent.info_name:
            raise click.ClickException("parent command missing info name")

        source_cmd = parent.info_name.replace("-", "_")
        parent_options: dict = parent.params
        conform_click_options(options)

        verbose = parent_options.get("verbose", False)
        level = logging.DEBUG if verbose else logging.INFO
        ingest_log_streaming_init(level)
        for option_set in (parent_options, options):
            log_options(option_set, verbose=verbose)

        try:
            self.check_dest_options(options=options)
            runner = self.get_dest_runner(
                source_cmd=source_cmd,
                options=options,
                parent_options=parent_options,
            )
            runner.run(**parent_options)
        except Exception as e:
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_dest_cmd(self) -> click.Command:
        """Create the click command for this destination without decorators."""
        cmd: click.Command = click.command(click.pass_context(self.dest))
        cmd.name = self.cmd_name
        cmd.invoke_without_command = True

        # Fresh list each call, so instance attributes are not modified here.
        options = []
        if self.cli_config:
            options.append(self.cli_config)
        options.extend(self.additional_cli_options)
        if self.is_fsspec and CliFilesStorageConfig not in options:
            options.append(CliFilesStorageConfig)

        add_options(cmd, extras=options, is_src=False)
        return cmd
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.cmd import BaseCmd
7
+ from unstructured_ingest.cli.common import (
8
+ log_options,
9
+ )
10
+ from unstructured_ingest.cli.interfaces import CliFilesStorageConfig
11
+ from unstructured_ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
12
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
13
+ from unstructured_ingest.runner import runner_map
14
+
15
+
16
@dataclass
class BaseSrcCmd(BaseCmd):
    """CLI source command; also acts as the group hosting destination subcommands."""

    def get_source_runner(self, options: dict):
        """Instantiate the runner registered for this command from the CLI options."""
        # Same dict object as the instance attribute: updated in place.
        extras = self.addition_configs
        extras.setdefault("connector_config", self.cli_config)

        to_validate = [self.cli_config] if self.cli_config else None
        configs = extract_configs(options, validate=to_validate, extras=extras)
        runner_cls = runner_map[self.cmd_name_key]
        return runner_cls(**configs)  # type: ignore

    def src(self, ctx: click.Context, **options):
        """Click callback: run the pipeline unless a subcommand was invoked.

        Raises:
            click.ClickException: wrapping any exception raised while building
                or running the runner.
        """
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        verbose = options.get("verbose", False)
        level = logging.DEBUG if verbose else logging.INFO
        ingest_log_streaming_init(level)
        log_options(options, verbose=verbose)
        try:
            self.get_source_runner(options=options).run(**options)
        except Exception as e:
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_src_cmd(self) -> click.Group:
        """Create the click group for this source without decorators."""
        cmd: click.Group = click.group(click.pass_context(self.src), cls=Group)
        cmd.name = self.cmd_name
        cmd.invoke_without_command = True

        # Fresh list each call, so instance attributes are not modified here.
        extra_options = []
        if self.cli_config:
            extra_options.append(self.cli_config)
        extra_options.extend(self.additional_cli_options)
        if self.is_fsspec and CliFilesStorageConfig not in extra_options:
            extra_options.append(CliFilesStorageConfig)

        add_options(cmd, extras=extra_options)
        return cmd