unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,37 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.cli import dest, src
6
+ from unstructured_ingest.v2.cli.cmds import dest as dest_v2
7
+ from unstructured_ingest.v2.cli.cmds import src as src_v2
8
+
9
+ if TYPE_CHECKING:
10
+ from click import Command
11
+
12
+
13
# Root click group of the CLI; subcommands are attached to it by get_cmd().
# Deliberately no docstring: click would surface it as the group's help text.
@click.group()
def ingest():
    pass
16
+
17
+
18
def get_cmd() -> "Command":
    """Build the top-level CLI command with every source/destination pairing attached.

    Source and destination subcommands are collected by name from the v1 lists
    (``src`` / ``dest``) and then from the v2 lists (``src_v2`` / ``dest_v2``);
    a v2 command replaces a v1 command that has the same name. Every destination
    subcommand is then nested under every source subcommand, and each source
    subcommand is registered on the ``ingest`` group.

    Returns:
        The ``ingest`` click group with all subcommands added.
    """
    # Later updates win on name clashes, so v2 commands take precedence over v1.
    sources_by_name = {command.name: command for command in src}
    sources_by_name.update({command.name: command for command in src_v2})

    destinations_by_name = {command.name: command for command in dest}
    destinations_by_name.update({command.name: command for command in dest_v2})

    root = ingest
    for source_command in sources_by_name.values():
        # Nest every destination under this source before registering the source.
        for destination_command in destinations_by_name.values():
            source_command.add_command(destination_command)
        root.add_command(source_command)
    return root
@@ -0,0 +1,12 @@
1
+ import typing as t
2
+
3
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
4
+ from unstructured_ingest.cli.cmds import base_src_cmd_fns
5
+
6
+
7
def get_src_cmd_map() -> t.Dict[str, t.Callable[[], BaseSrcCmd]]:
    """Map each source command's ``cmd_name_key`` to the factory that builds it.

    Every factory in ``base_src_cmd_fns`` is called once to read the key of the
    command it produces; the factory itself (not the instance) is stored.
    """
    factories_by_key: t.Dict[str, t.Callable[[], BaseSrcCmd]] = {}
    for factory in base_src_cmd_fns:
        factories_by_key[factory().cmd_name_key] = factory
    return factories_by_key
9
+
10
+
11
def get_src_cmd(cmd_name: str) -> t.Callable[[], BaseSrcCmd]:
    """Return the source command factory registered under ``cmd_name``.

    Raises:
        KeyError: if no source command uses ``cmd_name`` as its key.
    """
    factories_by_key = get_src_cmd_map()
    return factories_by_key[cmd_name]
@@ -0,0 +1,145 @@
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import typing as t
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
8
+
9
+ from .airtable import get_base_src_cmd as airtable_base_src_cmd
10
+ from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
11
+ from .astradb import get_base_src_cmd as astradb_base_src_cmd
12
+ from .azure_ai_search import get_base_dest_cmd as azure_ai_search_base_dest_cmd
13
+ from .biomed import get_base_src_cmd as biomed_base_src_cmd
14
+ from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
15
+ from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd
16
+ from .confluence import get_base_src_cmd as confluence_base_src_cmd
17
+ from .databricks_volumes import get_base_dest_cmd as databricks_volumes_dest_cmd
18
+ from .delta_table import get_base_dest_cmd as delta_table_dest_cmd
19
+ from .delta_table import get_base_src_cmd as delta_table_base_src_cmd
20
+ from .discord import get_base_src_cmd as discord_base_src_cmd
21
+ from .elasticsearch import get_base_dest_cmd as elasticsearch_base_dest_cmd
22
+ from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
23
+ from .fsspec.azure import get_base_dest_cmd as azure_base_dest_cmd
24
+ from .fsspec.azure import get_base_src_cmd as azure_base_src_cmd
25
+ from .fsspec.box import get_base_dest_cmd as box_base_dest_cmd
26
+ from .fsspec.box import get_base_src_cmd as box_base_src_cmd
27
+ from .fsspec.dropbox import get_base_dest_cmd as dropbox_base_dest_cmd
28
+ from .fsspec.dropbox import get_base_src_cmd as dropbox_base_src_cmd
29
+ from .fsspec.fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
30
+ from .fsspec.fsspec import get_base_src_cmd as fsspec_base_src_cmd
31
+ from .fsspec.gcs import get_base_dest_cmd as gcs_base_dest_cmd
32
+ from .fsspec.gcs import get_base_src_cmd as gcs_base_src_cmd
33
+ from .fsspec.s3 import get_base_dest_cmd as s3_base_dest_cmd
34
+ from .fsspec.s3 import get_base_src_cmd as s3_base_src_cmd
35
+ from .github import get_base_src_cmd as github_base_src_cmd
36
+ from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
37
+ from .google_drive import get_base_src_cmd as google_drive_base_src_cmd
38
+ from .hubspot import get_base_src_cmd as hubspot_base_src_cmd
39
+ from .jira import get_base_src_cmd as jira_base_src_cmd
40
+ from .kafka import get_base_dest_cmd as kafka_base_dest_cmd
41
+ from .kafka import get_base_src_cmd as kafka_base_src_cmd
42
+ from .local import get_base_src_cmd as local_base_src_cmd
43
+ from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
44
+ from .mongodb import get_base_src_cmd as mongodb_base_src_cmd
45
+ from .notion import get_base_src_cmd as notion_base_src_cmd
46
+ from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
47
+ from .opensearch import get_base_dest_cmd as opensearch_base_dest_cmd
48
+ from .opensearch import get_base_src_cmd as opensearch_base_src_cmd
49
+ from .outlook import get_base_src_cmd as outlook_base_src_cmd
50
+ from .pinecone import get_base_dest_cmd as pinecone_base_dest_cmd
51
+ from .qdrant import get_base_dest_cmd as qdrant_base_dest_cmd
52
+ from .reddit import get_base_src_cmd as reddit_base_src_cmd
53
+ from .salesforce import get_base_src_cmd as salesforce_base_src_cmd
54
+ from .sharepoint import get_base_src_cmd as sharepoint_base_src_cmd
55
+ from .slack import get_base_src_cmd as slack_base_src_cmd
56
+ from .sql import get_base_dest_cmd as sql_base_dest_cmd
57
+ from .vectara import get_base_dest_cmd as vectara_base_dest_cmd
58
+ from .weaviate import get_base_dest_cmd as weaviate_dest_cmd
59
+ from .wikipedia import get_base_src_cmd as wikipedia_base_src_cmd
60
+
61
+ if t.TYPE_CHECKING:
62
+ from unstructured_ingest.cli.base.dest import BaseDestCmd
63
+
64
# Factories for every source connector command exposed by the CLI.
# Order is preserved exactly; each entry is a zero-argument callable.
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
    airtable_base_src_cmd,
    astradb_base_src_cmd,
    azure_base_src_cmd,
    biomed_base_src_cmd,
    box_base_src_cmd,
    confluence_base_src_cmd,
    delta_table_base_src_cmd,
    discord_base_src_cmd,
    dropbox_base_src_cmd,
    elasticsearch_base_src_cmd,
    fsspec_base_src_cmd,
    gcs_base_src_cmd,
    github_base_src_cmd,
    gitlab_base_src_cmd,
    google_drive_base_src_cmd,
    hubspot_base_src_cmd,
    jira_base_src_cmd,
    kafka_base_src_cmd,
    local_base_src_cmd,
    mongodb_base_src_cmd,
    notion_base_src_cmd,
    onedrive_base_src_cmd,
    opensearch_base_src_cmd,
    outlook_base_src_cmd,
    reddit_base_src_cmd,
    salesforce_base_src_cmd,
    sftp_base_src_cmd,
    sharepoint_base_src_cmd,
    slack_base_src_cmd,
    s3_base_src_cmd,
    wikipedia_base_src_cmd,
]

# Fail at import time if two source commands share a name: each factory is
# called once and the resulting cmd_name values are counted.
src_cmd_names = [factory().cmd_name for factory in base_src_cmd_fns]
src_duplicates = [
    name
    for name, occurrences in collections.Counter(src_cmd_names).items()
    if occurrences > 1
]
if src_duplicates:
    raise ValueError(
        "multiple base src commands defined with the same names: {}".format(
            ", ".join(src_duplicates)
        )
    )
107
+
108
# Factories for every destination connector command exposed by the CLI.
# Order is preserved exactly; each entry is a zero-argument callable.
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
    astradb_base_dest_cmd,
    azure_base_dest_cmd,
    box_base_dest_cmd,
    chroma_base_dest_cmd,
    clarifai_base_dest_cmd,
    databricks_volumes_dest_cmd,
    dropbox_base_dest_cmd,
    elasticsearch_base_dest_cmd,
    fsspec_base_dest_cmd,
    gcs_base_dest_cmd,
    kafka_base_dest_cmd,
    s3_base_dest_cmd,
    azure_ai_search_base_dest_cmd,
    delta_table_dest_cmd,
    sql_base_dest_cmd,
    weaviate_dest_cmd,
    mongo_base_dest_cmd,
    pinecone_base_dest_cmd,
    qdrant_base_dest_cmd,
    opensearch_base_dest_cmd,
    vectara_base_dest_cmd,
]

# Fail at import time if two destination commands share a name: each factory
# is called once and the resulting cmd_name values are counted.
dest_cmd_names = [factory().cmd_name for factory in base_dest_cmd_fns]
dest_duplicates = [
    name
    for name, occurrences in collections.Counter(dest_cmd_names).items()
    if occurrences > 1
]
if dest_duplicates:
    raise ValueError(
        "multiple base dest commands defined with the same names: {}".format(
            ", ".join(dest_duplicates)
        )
    )

# Public API of this package: only the two factory lists.
__all__ = [
    "base_src_cmd_fns",
    "base_dest_cmd_fns",
]
@@ -0,0 +1,69 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.airtable import SimpleAirtableConfig
11
+
12
+
13
@dataclass
class AirtableCliConfig(SimpleAirtableConfig, CliConfig):
    """Click options for the Airtable source connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: access token and the optional list of paths."""
        access_token_option = click.Option(
            ["--personal-access-token"],
            default=None,
            help="Personal access token to authenticate into Airtable. Check: "
            "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
            "for more info",
        )
        # The help text below is reproduced line for line as it appears in the
        # source; its lines are intentionally left unindented.
        paths_option = click.Option(
            ["--list-of-paths"],
            default=None,
            help="""
A list of paths that specify the locations to ingest data from within Airtable.

If this argument is not set, the connector ingests all tables within each and every base.
--list-of-paths: path1 path2 path3 ….
path: base_id/table_id(optional)/view_id(optional)/

To obtain (base, table, view) ids in bulk, check:
https://airtable.com/developers/web/api/list-bases (base ids)
https://airtable.com/developers/web/api/get-base-schema (table and view ids)
https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)

To obtain specific ids from Airtable UI, go to your workspace, and copy any
relevant id from the URL structure:
https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
appAbcDeF1ghijKlm -> base_id
tblABcdEfG1HIJkLm -> table_id
viwABCDEfg6hijKLM -> view_id

You can also check: https://support.airtable.com/docs/finding-airtable-ids

Here is an example for one --list-of-paths:
base1/ → gets the entirety of all tables inside base1
base1/table1 → gets all rows and columns within table1 in base1
base1/table1/view1 → gets the rows and columns that are
visible in view1 for the table1 in base1

Examples to invalid airtable_paths:
table1 → has to mention base to be valid
base1/view1 → has to mention table to be valid
""",
        )
        return [access_token_option, paths_option]
62
+
63
+
64
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``airtable`` source command, configured by AirtableCliConfig."""
    return BaseSrcCmd(cmd_name="airtable", cli_config=AirtableCliConfig)
@@ -0,0 +1,99 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import CliConfig, Dict
7
+ from unstructured_ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
8
+
9
+
10
@dataclass
class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
    """Connection-level click options for the Astra DB connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: token, API endpoint, collection name, keyspace."""
        token_option = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Astra DB Token with access to the database.",
            envvar="ASTRA_DB_APPLICATION_TOKEN",
            show_envvar=True,
        )
        api_endpoint_option = click.Option(
            ["--api-endpoint"],
            required=True,
            type=str,
            help="The API endpoint for the Astra DB.",
            envvar="ASTRA_DB_API_ENDPOINT",
            show_envvar=True,
        )
        collection_name_option = click.Option(
            ["--collection-name"],
            required=False,
            type=str,
            help="The name of the Astra DB collection. "
            "Note that the collection name must only include letters, "
            "numbers, and underscores.",
        )
        keyspace_option = click.Option(
            ["--keyspace"],
            required=False,
            default=None,
            type=str,
            help="The Astra DB connection keyspace.",
        )
        return [
            token_option,
            api_endpoint_option,
            collection_name_option,
            keyspace_option,
        ]
48
+
49
+
50
@dataclass
class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
    """Write-side click options for the Astra DB destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: embedding dimension, indexing policy, batch size."""
        options = [
            click.Option(
                ["--embedding-dimension"],
                required=True,
                default=384,
                type=int,
                help="The dimensionality of the embeddings",
            ),
            click.Option(
                ["--requested-indexing-policy"],
                required=False,
                default=None,
                type=Dict(),
                # Adjacent literals are concatenated verbatim, so the first one
                # must end with a space to keep the sentences apart.
                help="The indexing policy to use for the collection. "
                'example: \'{"deny": ["metadata"]}\' ',
            ),
            click.Option(
                ["--batch-size"],
                default=20,
                type=int,
                help="Number of records per batch",
            ),
        ]
        return options
78
+
79
+
80
def get_base_src_cmd():
    """Build the ``astradb`` source command, configured by AstraDBCliConfig."""
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from unstructured_ingest.cli.base.src import BaseSrcCmd

    return BaseSrcCmd(cmd_name="astradb", cli_config=AstraDBCliConfig)
88
+
89
+
90
def get_base_dest_cmd():
    """Build the ``astradb`` destination command.

    Connection options come from AstraDBCliConfig, the extra write options from
    AstraDBCliWriteConfig, and AstraDBWriteConfig is passed as the write config.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="astradb",
        cli_config=AstraDBCliConfig,
        additional_cli_options=[AstraDBCliWriteConfig],
        write_config=AstraDBWriteConfig,
    )
@@ -0,0 +1,65 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import (
7
+ CliConfig,
8
+ )
9
+ from unstructured_ingest.connector.azure_ai_search import (
10
+ AzureAISearchWriteConfig,
11
+ SimpleAzureAISearchStorageConfig,
12
+ )
13
+
14
+
15
@dataclass
class AzureAISearchCliConfig(SimpleAzureAISearchStorageConfig, CliConfig):
    """Connection-level click options for the Azure AI Search destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: API key and service endpoint."""
        options = [
            click.Option(
                ["--key"],
                required=True,
                type=str,
                help="Key credential used for authenticating to an Azure service.",
                envvar="AZURE_SEARCH_API_KEY",
                show_envvar=True,
            ),
            click.Option(
                ["--endpoint"],
                required=True,
                type=str,
                # This is a plain string, not a format template, so the
                # placeholder uses single braces; doubled braces would be
                # shown literally in the help output.
                help="The URL endpoint of an Azure search service. "
                "In the form of https://{service_name}.search.windows.net",
                envvar="AZURE_SEARCH_ENDPOINT",
                show_envvar=True,
            ),
        ]
        return options
39
+
40
+
41
@dataclass
class AzureAISearchCliWriteConfig(AzureAISearchWriteConfig, CliConfig):
    """Write-side click options for the Azure AI Search destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single click option: the required target index name."""
        index_option = click.Option(
            ["--index"],
            required=True,
            type=str,
            help="The name of the index to connect to",
        )
        return [index_option]
54
+
55
+
56
def get_base_dest_cmd():
    """Build the ``azure-ai-search`` destination command.

    Connection options come from AzureAISearchCliConfig and the extra write
    options from AzureAISearchCliWriteConfig.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    # NOTE(review): write_config is the Cli subclass here, not the plain
    # AzureAISearchWriteConfig — confirm this is intentional.
    return BaseDestCmd(
        cmd_name="azure-ai-search",
        cli_config=AzureAISearchCliConfig,
        additional_cli_options=[AzureAISearchCliWriteConfig],
        write_config=AzureAISearchCliWriteConfig,
    )
@@ -0,0 +1,52 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.biomed import SimpleBiomedConfig
11
+
12
+
13
@dataclass
class BiomedCliConfig(SimpleBiomedConfig, CliConfig):
    """Click options for the Biomed source connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: OA API id/from/until, FTP path, max request time."""
        api_id_option = click.Option(
            ["--api-id"],
            default=None,
            help="ID parameter for OA Web Service API.",
        )
        api_from_option = click.Option(
            ["--api-from"],
            default=None,
            help="From parameter for OA Web Service API.",
        )
        api_until_option = click.Option(
            ["--api-until"],
            default=None,
            help="Until parameter for OA Web Service API.",
        )
        path_option = click.Option(
            ["--path"],
            default=None,
            help="PMC Open Access FTP Directory Path.",
        )
        max_request_time_option = click.Option(
            ["--max-request-time"],
            default=45,
            help="(In seconds) Max request time to OA Web Service API.",
        )
        return [
            api_id_option,
            api_from_option,
            api_until_option,
            path_option,
            max_request_time_option,
        ]
45
+
46
+
47
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``biomed`` source command, configured by BiomedCliConfig."""
    return BaseSrcCmd(cmd_name="biomed", cli_config=BiomedCliConfig)
@@ -0,0 +1,104 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import CliConfig, Dict
7
+ from unstructured_ingest.connector.chroma import ChromaWriteConfig, SimpleChromaConfig
8
+
9
+
10
@dataclass
class ChromaCliConfig(SimpleChromaConfig, CliConfig):
    """Connection-level click options for the Chroma destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for reaching a Chroma instance and its collection.

        Adjacent string literals in the help texts are concatenated verbatim, so
        each fragment that is followed by another ends with an explicit space.
        """
        options = [
            click.Option(
                ["--path"],
                required=False,
                type=str,
                help="Location where Chroma is persisted, if not connecting via http.",
            ),
            click.Option(
                ["--settings"],
                required=False,
                type=Dict(),
                help="A dictionary of settings to communicate with the chroma server. "
                'example: \'{"persist_directory":"./chroma-persist"}\' ',
            ),
            click.Option(
                ["--tenant"],
                required=False,
                default="default_tenant",
                type=str,
                help="The tenant to use for this client. Chroma defaults to 'default_tenant'.",
            ),
            click.Option(
                ["--database"],
                required=False,
                default="default_database",
                type=str,
                help="The database to use for this client. "
                "Chroma defaults to 'default_database'.",
            ),
            click.Option(
                ["--host"],
                required=False,
                type=str,
                help="The hostname of the Chroma server.",
            ),
            click.Option(
                ["--port"],
                required=False,
                type=int,
                help="The port of the Chroma server.",
            ),
            click.Option(
                ["--ssl"],
                required=False,
                default=False,
                is_flag=True,
                type=bool,
                help="Whether to use SSL to connect to the Chroma server.",
            ),
            click.Option(
                ["--headers"],
                required=False,
                type=Dict(),
                help="A dictionary of headers to send to the Chroma server. "
                'example: \'{"Authorization":"Basic()"}\' ',
            ),
            click.Option(
                ["--collection-name"],
                required=True,
                type=str,
                help="The name of the Chroma collection to write into.",
            ),
        ]
        return options
78
+
79
+
80
@dataclass
class ChromaCliWriteConfig(ChromaWriteConfig, CliConfig):
    """Write-side click options for the Chroma destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single click option: batch size (default 100)."""
        batch_size_option = click.Option(
            ["--batch-size"],
            default=100,
            type=int,
            help="Number of records per batch",
        )
        return [batch_size_option]
93
+
94
+
95
def get_base_dest_cmd():
    """Build the ``chroma`` destination command.

    Connection options come from ChromaCliConfig, the extra write options from
    ChromaCliWriteConfig, and ChromaWriteConfig is passed as the write config.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name="chroma",
        cli_config=ChromaCliConfig,
        additional_cli_options=[ChromaCliWriteConfig],
        write_config=ChromaWriteConfig,
    )
@@ -0,0 +1,71 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import CliConfig
7
+ from unstructured_ingest.connector.clarifai import (
8
+ ClarifaiWriteConfig,
9
+ SimpleClarifaiConfig,
10
+ )
11
+
12
# Name under which the Clarifai destination command is registered.
CMD_NAME = "clarifai"
13
+
14
+
15
@dataclass
class ClarifaiCliConfig(SimpleClarifaiConfig, CliConfig):
    """Connection-level click options for the Clarifai destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options: API key, app id, user id, optional dataset id."""
        api_key_option = click.Option(
            ["--api-key"],
            required=True,
            type=str,
            help="The CLARIFAI_PAT of the user to access clarifai platform apps and models",
            envvar="CLARIFAI_PAT",
            show_envvar=True,
        )
        app_id_option = click.Option(
            ["--app-id"],
            required=True,
            type=str,
            help="Clarifai app name/id",
        )
        user_id_option = click.Option(
            ["--user-id"],
            required=True,
            type=str,
            help="Clarifai User name/ID",
        )
        dataset_id_option = click.Option(
            ["--dataset-id"],
            type=str,
            default=None,
            help="Clarifai App Dataset ID (optional)",
        )
        return [api_key_option, app_id_option, user_id_option, dataset_id_option]
45
+
46
+
47
@dataclass
class ClarifaiCliWriteConfig(ClarifaiWriteConfig, CliConfig):
    """Write-side click options for the Clarifai destination connector."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single click option: batch size (default 50).

        The return annotation names the ``click.Option`` class; lowercase
        ``click.option`` is the decorator function, not a type.
        """
        options = [
            click.Option(
                ["--batch-size"],
                type=int,
                default=50,
                help="No of inputs upload per batch",
            ),
        ]
        return options
60
+
61
+
62
def get_base_dest_cmd():
    """Build the Clarifai destination command, named by ``CMD_NAME``.

    Connection options come from ClarifaiCliConfig, the extra write options from
    ClarifaiCliWriteConfig, and ClarifaiWriteConfig is passed as the write config.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import — confirm.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=ClarifaiCliConfig,
        additional_cli_options=[ClarifaiCliWriteConfig],
        write_config=ClarifaiWriteConfig,
    )