unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356) hide show
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,73 @@
1
+ from typing import Any
2
+
3
+ import pandas as pd
4
+
5
+ from unstructured_ingest.utils.data_prep import flatten_dict
6
+
7
+
8
def get_default_pandas_dtypes() -> dict[str, Any]:
    """Return the default pandas dtype to use for each known element column.

    The mapping is keyed by column name. Values are whatever
    ``DataFrame.astype`` accepts: a ``pd.StringDtype`` instance, the dtype
    names ``"Int64"`` / ``"boolean"``, or the builtin ``float`` / ``object``.
    """
    # Aliases keep the table below compact. ``nullable_str`` is called per row
    # so each string column gets its own StringDtype instance.
    nullable_str = pd.StringDtype  # type: ignore
    nullable_int = "Int64"
    nullable_bool = "boolean"
    py_object = object

    # Declared as ordered pairs so the resulting dict keeps this exact key order.
    column_dtypes: list[tuple[str, Any]] = [
        ("text", nullable_str()),
        ("type", nullable_str()),
        ("element_id", nullable_str()),
        ("filename", nullable_str()),
        ("filetype", nullable_str()),
        ("file_directory", nullable_str()),
        ("last_modified", nullable_str()),
        ("attached_to_filename", nullable_str()),
        ("parent_id", nullable_str()),
        ("category_depth", nullable_int),
        ("image_path", nullable_str()),
        ("languages", py_object),
        ("page_number", nullable_int),
        ("page_name", nullable_str()),
        ("url", nullable_str()),
        ("link_urls", nullable_str()),
        ("link_texts", py_object),
        ("links", py_object),
        ("sent_from", py_object),
        ("sent_to", py_object),
        ("subject", nullable_str()),
        ("section", nullable_str()),
        ("header_footer_type", nullable_str()),
        ("emphasized_text_contents", py_object),
        ("emphasized_text_tags", py_object),
        ("text_as_html", nullable_str()),
        ("regex_metadata", py_object),
        ("max_characters", nullable_int),
        ("is_continuation", nullable_bool),
        ("detection_class_prob", float),
        ("sender", nullable_str()),
        ("coordinates_points", py_object),
        ("coordinates_system", nullable_str()),
        ("coordinates_layout_width", float),
        ("coordinates_layout_height", float),
        ("data_source_url", nullable_str()),
        ("data_source_version", nullable_str()),
        ("data_source_record_locator", py_object),
        ("data_source_date_created", nullable_str()),
        ("data_source_date_modified", nullable_str()),
        ("data_source_date_processed", nullable_str()),
        ("data_source_permissions_data", py_object),
        ("embeddings", py_object),
        ("regex_metadata_key", py_object),
    ]
    return dict(column_dtypes)
55
+
56
+
57
def convert_to_pandas_dataframe(
    elements_dict: list[dict[str, Any]],
    drop_empty_cols: bool = False,
) -> pd.DataFrame:
    """Build a DataFrame from element dicts, merging each ``metadata`` into its row.

    Args:
        elements_dict: One dict per element. A ``"metadata"`` entry, when
            present and non-empty, is passed through ``flatten_dict`` (with
            ``keys_to_omit=["data_source_record_locator"]``) and merged into
            the top level of the row. The input dicts are left untouched.
        drop_empty_cols: When true, columns whose values are all missing are
            dropped from the result.

    Returns:
        A DataFrame whose columns present in ``get_default_pandas_dtypes()``
        have been cast to those dtypes.
    """
    rows: list[dict[str, Any]] = []
    for element in elements_dict:
        # Shallow-copy so that popping/merging metadata does not alter the
        # caller's dictionaries.
        row = dict(element)
        # Flatten metadata if it hasn't already been flattened
        if metadata := row.pop("metadata", None):
            row.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
        rows.append(row)

    df = pd.DataFrame.from_dict(rows)
    # Only cast columns that actually exist; astype raises on unknown keys.
    present_dtypes = {
        column: dtype
        for column, dtype in get_default_pandas_dtypes().items()
        if column in df.columns
    }
    df = df.astype(present_dtypes)
    if drop_empty_cols:
        df = df.dropna(axis=1, how="all")
    return df
@@ -0,0 +1 @@
1
+ from __future__ import annotations
File without changes
@@ -0,0 +1,4 @@
1
+ from .dest import DestCmd
2
+ from .src import SrcCmd
3
+
4
+ __all__ = ["SrcCmd", "DestCmd"]
@@ -0,0 +1,215 @@
1
+ import inspect
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass, field, fields
4
+ from typing import Any, Optional, Type, TypeVar
5
+
6
+ import click
7
+
8
+ from unstructured_ingest.v2.cli.base.importer import import_from_string
9
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
10
+ from unstructured_ingest.v2.cli.utils import extract_config
11
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
12
+ from unstructured_ingest.v2.logger import logger
13
+ from unstructured_ingest.v2.pipeline.pipeline import Pipeline
14
+ from unstructured_ingest.v2.processes.chunker import Chunker, ChunkerConfig
15
+ from unstructured_ingest.v2.processes.connector_registry import (
16
+ DownloaderT,
17
+ IndexerT,
18
+ UploaderT,
19
+ UploadStager,
20
+ UploadStagerConfig,
21
+ UploadStagerT,
22
+ destination_registry,
23
+ source_registry,
24
+ )
25
+ from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
26
+ from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
27
+ from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
28
+
29
+ CommandT = TypeVar("CommandT", bound=click.Command)
30
+
31
+
32
@dataclass
class BaseCmd(ABC):
    """Base class for CLI commands that assemble an ingest ``Pipeline``.

    Subclasses implement :meth:`cmd`. The helpers here turn the flat option
    dictionaries handed over by click into the individual pipeline pieces
    (processor config, indexer, downloader, partitioner, optional chunker and
    embedder, optional upload stager, and uploader).
    """

    cmd_name: str
    # CLI config classes whose options are added to every generated command,
    # after the command-specific ones (see ``add_options``).
    default_configs: list[Type[CliConfig]] = field(default_factory=list)

    @property
    def cmd_name_key(self):
        """``cmd_name`` with dashes replaced by underscores."""
        return self.cmd_name.replace("-", "_")

    @property
    def cli_cmd_name(self):
        """``cmd_name`` with underscores replaced by dashes."""
        return self.cmd_name.replace("_", "-")

    @abstractmethod
    def cmd(self, ctx: click.Context, **options) -> None:
        """Entry point invoked by click; implemented by subclasses."""
        pass

    def add_options(self, cmd: CommandT, extras: list[Type[CliConfig]]) -> CommandT:
        """Register the CLI options of ``extras`` and ``default_configs`` on ``cmd``.

        Args:
            cmd: The click command to add options to.
            extras: Command-specific config classes; applied before the defaults.
                The list itself is not modified.

        Returns:
            The same ``cmd`` object that was passed in.

        Raises:
            ValueError: If a config class fails to add its options; the
                original error is attached as the cause.
        """
        # make sure what's unique to this cmd appears first; build a new list
        # rather than extending the caller's `extras` in place
        ordered_configs = [*extras, *self.default_configs]
        for config in ordered_configs:
            try:
                config.add_cli_options(cmd=cmd)
            except ValueError as e:
                raise ValueError(f"failed to set configs from {config.__name__}: {e}") from e
        return cmd

    def get_pipline(
        self,
        src: str,
        source_options: dict[str, Any],
        dest: Optional[str] = None,
        destination_options: Optional[dict[str, Any]] = None,
    ) -> Pipeline:
        """Build a ``Pipeline`` from the source and (optional) destination options.

        Args:
            src: Key into ``source_registry`` identifying the source connector.
            source_options: Flat options for the source and processing steps.
            dest: Key into ``destination_registry``; when falsy, a
                ``LocalUploader`` is used instead.
            destination_options: Flat options for the destination. ``None`` is
                treated as an empty dict. The dict is never modified.

        Returns:
            The configured ``Pipeline``.

        Raises:
            KeyError: If no ``dest`` is given and neither option dict contains
                ``"output_dir"``.
        """
        logger.debug(
            f"creating pipeline from cli using source {src} with options: {source_options}"
        )
        pipeline_kwargs: dict[str, Any] = {
            "context": self.get_processor_config(options=source_options),
            "downloader": self.get_downloader(src=src, options=source_options),
            "indexer": self.get_indexer(src=src, options=source_options),
            "partitioner": self.get_partitioner(options=source_options),
        }
        # Chunking and embedding are optional steps; only add them when configured.
        if chunker := self.get_chunker(options=source_options):
            pipeline_kwargs["chunker"] = chunker
        if embedder := self.get_embeder(options=source_options):
            pipeline_kwargs["embedder"] = embedder
        if dest:
            # Tolerate a missing options dict: the helpers below call
            # ``options.get(...)`` and would otherwise fail on ``None``.
            dest_options = destination_options or {}
            logger.debug(
                f"setting destination on pipeline {dest} with options: {dest_options}"
            )
            if uploader_stager := self.get_upload_stager(dest=dest, options=dest_options):
                pipeline_kwargs["stager"] = uploader_stager
            pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=dest_options)
        else:
            # Default to local uploader
            # TODO remove after v1 no longer supported
            # Work on a copy so the caller's dict does not gain an "output_dir" key.
            local_options = dict(destination_options or {})
            if "output_dir" not in local_options:
                local_options["output_dir"] = source_options["output_dir"]
            pipeline_kwargs["uploader"] = self.get_default_uploader(options=local_options)
        return Pipeline(**pipeline_kwargs)

    @staticmethod
    def get_default_uploader(options: dict[str, Any]) -> UploaderT:
        """Create a ``LocalUploader`` configured from ``options``."""
        uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig)
        return LocalUploader(upload_config=uploader_config)

    @staticmethod
    def get_chunker(options: dict[str, Any]) -> Optional[Chunker]:
        """Create a ``Chunker``, or return ``None`` when no chunking strategy is set."""
        chunker_config = extract_config(flat_data=options, config=ChunkerConfig)
        if not chunker_config.chunking_strategy:
            return None
        return Chunker(config=chunker_config)

    @staticmethod
    def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
        """Create an ``Embedder``, or return ``None`` when no embedding provider is set."""
        embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
        if not embedder_config.embedding_provider:
            return None
        return Embedder(config=embedder_config)

    @staticmethod
    def get_partitioner(options: dict[str, Any]) -> Partitioner:
        """Create a ``Partitioner`` configured from ``options``."""
        partitioner_config = extract_config(flat_data=options, config=PartitionerConfig)
        return Partitioner(config=partitioner_config)

    @staticmethod
    def get_processor_config(options: dict[str, Any]) -> ProcessorConfig:
        """Extract the ``ProcessorConfig`` from ``options``."""
        return extract_config(flat_data=options, config=ProcessorConfig)

    @staticmethod
    def get_indexer(src: str, options: dict[str, Any]) -> IndexerT:
        """Instantiate the indexer registered for ``src``.

        Index and connection configs are only passed when the registry entry
        declares a config class for them.

        Raises:
            KeyError: If ``src`` is not in ``source_registry``.
        """
        source_entry = source_registry[src]
        indexer_kwargs: dict[str, Any] = {}
        if indexer_config_cls := source_entry.indexer_config:
            indexer_kwargs["index_config"] = extract_config(
                flat_data=options, config=indexer_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            indexer_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        indexer_cls = source_entry.indexer
        return indexer_cls(**indexer_kwargs)

    @staticmethod
    def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT:
        """Instantiate the downloader registered for ``src``.

        Download and connection configs are only passed when the registry
        entry declares a config class for them.

        Raises:
            KeyError: If ``src`` is not in ``source_registry``.
        """
        source_entry = source_registry[src]
        downloader_kwargs: dict[str, Any] = {}
        if downloader_config_cls := source_entry.downloader_config:
            downloader_kwargs["download_config"] = extract_config(
                flat_data=options, config=downloader_config_cls
            )
        if connection_config_cls := source_entry.connection_config:
            downloader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        downloader_cls = source_entry.downloader
        return downloader_cls(**downloader_kwargs)

    @staticmethod
    def get_custom_stager(
        stager_reference: str, stager_config_kwargs: Optional[dict] = None
    ) -> Optional[UploadStagerT]:
        """Import and instantiate a user-supplied ``UploadStager`` implementation.

        Args:
            stager_reference: String reference resolved by ``import_from_string``.
            stager_config_kwargs: Keyword arguments for the stager's config
                class; when falsy the stager is created without a config.

        Returns:
            An instance of the referenced stager class.

        Raises:
            ValueError: If the reference is not a class, is not an
                ``UploadStager`` subclass, or its ``upload_stager_config``
                field type is not an ``UploadStagerConfig`` subclass.
            KeyError: If the stager has no ``upload_stager_config`` field.
        """
        stager_cls = import_from_string(stager_reference)
        if not inspect.isclass(stager_cls):
            raise ValueError(
                f"custom stager must be a reference to a python class, got: {type(stager_cls)}"
            )
        if not issubclass(stager_cls, UploadStager):
            raise ValueError(
                "custom stager must be an implementation of the UploadStager interface"
            )
        # The config class is discovered from the dataclass field annotation.
        # NOTE(review): if the stager's module uses string annotations, f.type
        # is a str and the class check below rejects it -- confirm whether
        # such stagers need to be supported.
        fields_dict = {f.name: f.type for f in fields(stager_cls)}
        upload_stager_config_cls = fields_dict["upload_stager_config"]
        if not inspect.isclass(upload_stager_config_cls):
            raise ValueError(
                f"custom stager config must be a class, got: {type(upload_stager_config_cls)}"
            )
        if not issubclass(upload_stager_config_cls, UploadStagerConfig):
            raise ValueError(
                "custom stager config must be an implementation "
                "of the UploadStagerConfig interface"
            )
        upload_stager_kwargs: dict[str, Any] = {}
        if stager_config_kwargs:
            upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls(
                **stager_config_kwargs
            )
        return stager_cls(**upload_stager_kwargs)

    @staticmethod
    def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]:
        """Return the upload stager for ``dest``, if any.

        A ``custom_stager`` entry in ``options`` takes precedence over the
        registry. Otherwise the stager class registered for ``dest`` is
        instantiated, or ``None`` is returned when the entry has none.

        Raises:
            KeyError: If no custom stager is given and ``dest`` is not in
                ``destination_registry``.
        """
        if custom_stager := options.get("custom_stager"):
            return BaseCmd.get_custom_stager(
                stager_reference=custom_stager,
                stager_config_kwargs=options.get("custom_stager_config_kwargs"),
            )
        dest_entry = destination_registry[dest]
        upload_stager_kwargs: dict[str, Any] = {}
        if upload_stager_config_cls := dest_entry.upload_stager_config:
            upload_stager_kwargs["upload_stager_config"] = extract_config(
                flat_data=options, config=upload_stager_config_cls
            )
        if upload_stager_cls := dest_entry.upload_stager:
            return upload_stager_cls(**upload_stager_kwargs)
        return None

    @staticmethod
    def get_uploader(dest: str, options: dict[str, Any]) -> UploaderT:
        """Instantiate the uploader registered for ``dest``.

        Upload and connection configs are only passed when the registry entry
        declares a config class for them.

        Raises:
            KeyError: If ``dest`` is not in ``destination_registry``.
        """
        dest_entry = destination_registry[dest]
        uploader_kwargs: dict[str, Any] = {}
        if uploader_config_cls := dest_entry.uploader_config:
            uploader_kwargs["upload_config"] = extract_config(
                flat_data=options, config=uploader_config_cls
            )
        if connection_config_cls := dest_entry.connection_config:
            uploader_kwargs["connection_config"] = extract_config(
                flat_data=options, config=connection_config_cls
            )
        uploader_cls = dest_entry.uploader
        return uploader_cls(**uploader_kwargs)
@@ -0,0 +1,76 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from typing import Optional, Type
4
+
5
+ import click
6
+
7
+ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
9
+ from unstructured_ingest.v2.cli.utils import Dict, conform_click_options
10
+ from unstructured_ingest.v2.logger import logger
11
+
12
+
13
@dataclass
class DestCmd(BaseCmd):
    """CLI command definition for a destination connector.

    The generated click command is meant to run as a subcommand of a source
    command: when invoked it reads the source's name and parameters from the
    parent click context and runs a pipeline from that source to this
    destination.
    """

    # Optional CliConfig classes; those that are set are passed to
    # ``add_options`` when the command is generated.
    connection_config: Optional[Type[CliConfig]] = None
    uploader_config: Optional[Type[CliConfig]] = None
    upload_stager_config: Optional[Type[CliConfig]] = None

    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the source-to-destination pipeline for this command.

        Args:
            ctx: Click context of this command; its parent must be the source
                command.
            **options: Parsed click options for the destination.

        Raises:
            click.ClickException: If there is no parent context, the parent
                has no ``info_name``, or building/running the pipeline fails.
        """
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        parent_ctx = ctx.parent
        if not parent_ctx:
            raise click.ClickException("destination command called without a parent")
        if not parent_ctx.info_name:
            raise click.ClickException("parent command missing info name")
        # CLI names use dashes; the source name passed on uses underscores.
        source_cmd = parent_ctx.info_name.replace("-", "_")
        # The parent was validated above, so its params can be read directly.
        source_options: dict = parent_ctx.params
        conform_click_options(options)
        try:
            # NOTE(review): ``get_pipline`` (sic) is defined outside this block,
            # presumably on BaseCmd; the spelling is kept to match it.
            pipeline = self.get_pipline(
                src=source_cmd,
                source_options=source_options,
                dest=self.cmd_name,
                destination_options=options,
            )
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Command:
        """Create the click command for this destination.

        Returns:
            A click command named ``self.cli_cmd_name`` carrying the options of
            every configured CliConfig plus the custom-stager options.

        Raises:
            ValueError: If click does not produce a ``Command`` instance.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.command(fn)
        if not isinstance(cmd, click.core.Command):
            raise ValueError(f"generated command was not of expected type Command: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        # NOTE(review): this mirrors the source command setup; confirm it has
        # any effect on a plain (non-group) command.
        cmd.invoke_without_command = True
        extras = [
            x
            for x in [self.uploader_config, self.upload_stager_config, self.connection_config]
            if x
        ]
        self.add_options(cmd, extras=extras)
        cmd.params.append(
            click.Option(
                ["--custom-stager"],
                required=False,
                type=str,
                default=None,
                help="Pass a pointer to a custom upload stager to use, "
                "must be in format '<module>:<attribute>'",
            )
        )
        cmd.params.append(
            click.Option(
                ["--custom-stager-config-kwargs"],
                required=False,
                type=Dict(),
                default=None,
                help="Any kwargs to instantiate the configuration "
                "associated with the custom stager",
            )
        )
        return cmd
@@ -0,0 +1,34 @@
1
+ import importlib
2
+ from typing import Any
3
+
4
+
5
class ImportFromStringError(Exception):
    """Raised when an import string is malformed or cannot be resolved."""


def import_from_string(import_str: Any) -> Any:
    """Resolve a ``"<module>:<attribute>"`` string to the object it names.

    Args:
        import_str: The import string. ``<attribute>`` may be a dotted path
            (``"pkg.mod:Class.method"``). Any non-string value is returned
            unchanged.

    Returns:
        The resolved attribute, or ``import_str`` itself when it is not a
        string.

    Raises:
        ImportFromStringError: If the string is not in
            ``"<module>:<attribute>"`` form, the requested module (or one of
            its parent packages) does not exist, or the attribute path cannot
            be resolved on the module.
        ModuleNotFoundError: If the requested module exists but one of its own
            imports is missing; that error is re-raised untouched so the real
            culprit stays visible.
    """
    if not isinstance(import_str, str):
        return import_str

    module_str, _, attrs_str = import_str.partition(":")
    if not module_str or not attrs_str:
        raise ImportFromStringError(
            f'Import string "{import_str}" must be in format "<module>:<attribute>".'
        )

    try:
        module = importlib.import_module(module_str)
    except ModuleNotFoundError as exc:
        missing = exc.name
        # ``exc.name`` is the module that could not be found. For "pkg.sub"
        # with "pkg" absent it is "pkg", so parent packages of the requested
        # module must count as "the requested module is missing" too.
        requested_is_missing = missing is not None and (
            module_str == missing or module_str.startswith(f"{missing}.")
        )
        if not requested_is_missing:
            raise exc from None
        raise ImportFromStringError(f'Could not import module "{module_str}".') from exc

    instance = module
    try:
        for attr_str in attrs_str.split("."):
            instance = getattr(instance, attr_str)
    except AttributeError as exc:
        raise ImportFromStringError(
            f'Attribute "{attrs_str}" not found in module "{module_str}".'
        ) from exc

    return instance
@@ -0,0 +1,70 @@
1
+ import logging
2
+ from dataclasses import dataclass, field
3
+ from typing import Optional, Type
4
+
5
+ import click
6
+
7
+ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
8
+ from unstructured_ingest.v2.cli.configs import (
9
+ ChunkerCliConfig,
10
+ EmbedderCliConfig,
11
+ PartitionerCliConfig,
12
+ ProcessorCliConfig,
13
+ )
14
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
15
+ from unstructured_ingest.v2.cli.utils import Group, conform_click_options
16
+ from unstructured_ingest.v2.logger import logger
17
+
18
+
19
@dataclass
class SrcCmd(BaseCmd):
    """CLI command definition for a source connector.

    The generated click group can run on its own (source only) or host a
    destination subcommand; when a subcommand is invoked, this command's own
    callback does nothing and leaves the work to the subcommand.
    """

    # Optional CliConfig classes; those that are set are passed to
    # ``add_options`` when the command is generated.
    indexer_config: Optional[Type[CliConfig]] = None
    downloader_config: Optional[Type[CliConfig]] = None
    connection_config: Optional[Type[CliConfig]] = None
    # Holds config classes (not instances), hence ``Type[CliConfig]``.
    default_configs: list[Type[CliConfig]] = field(
        default_factory=lambda: [
            ProcessorCliConfig,
            PartitionerCliConfig,
            EmbedderCliConfig,
            ChunkerCliConfig,
        ]
    )

    def cmd(self, ctx: click.Context, **options) -> None:
        """Run the pipeline for this source when no subcommand was invoked.

        Args:
            ctx: Click context of this command.
            **options: Parsed click options for the source.

        Raises:
            click.ClickException: If building or running the pipeline fails.
        """
        # A subcommand (destination) was requested; it runs the pipeline itself.
        if ctx.invoked_subcommand:
            return

        conform_click_options(options)
        logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
        try:
            # NOTE(review): ``get_pipline`` (sic) is defined outside this block,
            # presumably on BaseCmd; the spelling is kept to match it.
            pipeline = self.get_pipline(src=self.cmd_name, source_options=options)
            pipeline.run()
        except Exception as e:
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Create the click group for this source.

        Returns:
            A click group named ``self.cli_cmd_name`` carrying the options of
            every configured CliConfig plus ``--output-dir``.

        Raises:
            ValueError: If click does not produce a ``Group`` instance.
        """
        # Dynamically create the command without the use of click decorators
        fn = self.cmd
        fn = click.pass_context(fn)
        cmd = click.group(fn, cls=Group)
        if not isinstance(cmd, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}")
        cmd.name = self.cli_cmd_name
        cmd.short_help = "v2"
        # Lets the group callback run even when no subcommand is given.
        cmd.invoke_without_command = True
        extras = [
            x for x in [self.indexer_config, self.downloader_config, self.connection_config] if x
        ]
        self.add_options(cmd, extras=extras)

        # TODO remove after v1 no longer supported
        cmd.params.append(
            click.Option(
                ["--output-dir"],
                required=False,
                type=str,
                help="Local path to write partitioned output to",
            )
        )
        return cmd
@@ -0,0 +1,24 @@
1
+ import click
2
+
3
+ from unstructured_ingest.v2.cli.cmds import dest, src
4
+
5
+
6
# Root click group of the CLI. It does no work itself; the source commands are
# attached to it as subcommands in get_cmd().
# NOTE(review): documented with comments rather than a docstring because click
# presumably surfaces a callback's docstring as the command help text.
@click.group()
def ingest():
    pass
9
+
10
+
11
def get_cmd() -> click.Command:
    """Assemble the full CLI command tree and return its root.

    Every destination command is registered under every source group, and each
    source group is then registered under the root ``ingest`` group.
    """
    root = ingest
    for source_group in src:
        # Each source accepts every destination as a nested subcommand.
        for destination_command in dest:
            source_group.add_command(destination_command)
        root.add_command(source_group)
    return root
@@ -0,0 +1,87 @@
1
+ from collections import Counter
2
+
3
+ import click
4
+
5
+ from .astra import astra_dest_cmd
6
+ from .azure_cognitive_search import azure_cognitive_search_dest_cmd
7
+ from .chroma import chroma_dest_cmd
8
+ from .databricks_volumes import databricks_volumes_dest_cmd
9
+ from .elasticsearch import elasticsearch_dest_cmd, elasticsearch_src_cmd
10
+ from .fsspec.azure import azure_dest_cmd, azure_src_cmd
11
+ from .fsspec.box import box_dest_cmd, box_src_cmd
12
+ from .fsspec.dropbox import dropbox_dest_cmd, dropbox_src_cmd
13
+ from .fsspec.gcs import gcs_dest_cmd, gcs_src_cmd
14
+ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
15
+ from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
16
+ from .google_drive import google_drive_src_cmd
17
+ from .local import local_dest_cmd, local_src_cmd
18
+ from .mongodb import mongodb_dest_cmd
19
+ from .onedrive import onedrive_drive_src_cmd
20
+ from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
21
+ from .pinecone import pinecone_dest_cmd
22
+ from .salesforce import salesforce_src_cmd
23
+ from .sharepoint import sharepoint_drive_src_cmd
24
+ from .singlestore import singlestore_dest_cmd
25
+ from .sql import sql_dest_cmd
26
+ from .weaviate import weaviate_dest_cmd
27
+
28
# Source command definitions exposed by the CLI.
src_cmds = [
    azure_src_cmd,
    box_src_cmd,
    dropbox_src_cmd,
    elasticsearch_src_cmd,
    gcs_src_cmd,
    google_drive_src_cmd,
    local_src_cmd,
    onedrive_drive_src_cmd,
    opensearch_src_cmd,
    s3_src_cmd,
    salesforce_src_cmd,
    sharepoint_drive_src_cmd,
    sftp_src_cmd,
]
# Fail at import time if two source definitions share a command name.
duplicate_src_names = [
    cmd_name
    for cmd_name, occurrences in Counter(entry.cmd_name for entry in src_cmds).items()
    if occurrences > 1
]
if duplicate_src_names:
    reused_src = ", ".join(duplicate_src_names)
    raise ValueError(
        f"the following source cmd names were reused, all must be unique: {reused_src}"
    )

# Destination command definitions exposed by the CLI.
dest_cmds = [
    astra_dest_cmd,
    azure_cognitive_search_dest_cmd,
    azure_dest_cmd,
    box_dest_cmd,
    chroma_dest_cmd,
    dropbox_dest_cmd,
    elasticsearch_dest_cmd,
    gcs_dest_cmd,
    local_dest_cmd,
    opensearch_dest_cmd,
    pinecone_dest_cmd,
    s3_dest_cmd,
    sftp_dest_cmd,
    singlestore_dest_cmd,
    weaviate_dest_cmd,
    mongodb_dest_cmd,
    databricks_volumes_dest_cmd,
    sql_dest_cmd,
]

# Same uniqueness check for the destination definitions.
duplicate_dest_names = [
    cmd_name
    for cmd_name, occurrences in Counter(entry.cmd_name for entry in dest_cmds).items()
    if occurrences > 1
]
if duplicate_dest_names:
    reused_dest = ", ".join(duplicate_dest_names)
    raise ValueError(
        f"the following dest cmd names were reused, all must be unique: {reused_dest}"
    )


# Click objects generated from the definitions above, in the same order.
src: list[click.Group] = [definition.get_cmd() for definition in src_cmds]

dest: list[click.Command] = [definition.get_cmd() for definition in dest_cmds]
@@ -0,0 +1,85 @@
1
+ from dataclasses import dataclass
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.v2.cli.base import DestCmd
6
+ from unstructured_ingest.v2.cli.interfaces import CliConfig
7
+ from unstructured_ingest.v2.cli.utils import Dict
8
+ from unstructured_ingest.v2.processes.connectors.astra import CONNECTOR_TYPE
9
+
10
+
11
@dataclass
class AstraCliConnectionConfig(CliConfig):
    """CLI options describing how to connect to Astra DB."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the required ``--token`` and ``--api-endpoint`` options.

        Both options name an environment variable (``ASTRA_DB_TOKEN`` /
        ``ASTRA_DB_ENDPOINT``) via ``envvar`` and show it in the help output.
        """
        token_option = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Astra DB Token with access to the database.",
            envvar="ASTRA_DB_TOKEN",
            show_envvar=True,
        )
        endpoint_option = click.Option(
            ["--api-endpoint"],
            required=True,
            type=str,
            help="The API endpoint for the Astra DB.",
            envvar="ASTRA_DB_ENDPOINT",
            show_envvar=True,
        )
        return [token_option, endpoint_option]
34
+
35
+
36
@dataclass
class AstraCliUploaderConfig(CliConfig):
    """CLI options controlling how records are uploaded to Astra DB."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the uploader options.

        The options are ``--collection-name``, ``--embedding-dimension``,
        ``--namespace``, ``--requested-indexing-policy`` and ``--batch-size``.
        """
        options = [
            click.Option(
                ["--collection-name"],
                required=False,
                type=str,
                help="The name of the Astra DB collection. "
                "Note that the collection name must only include letters, "
                "numbers, and underscores.",
            ),
            # NOTE(review): required=True combined with a default looks
            # redundant; confirm which of the two is intended.
            click.Option(
                ["--embedding-dimension"],
                required=True,
                default=384,
                type=int,
                help="The dimensionality of the embeddings",
            ),
            click.Option(
                ["--namespace"],
                required=False,
                default=None,
                type=str,
                help="The Astra DB connection namespace.",
            ),
            click.Option(
                ["--requested-indexing-policy"],
                required=False,
                default=None,
                type=Dict(),
                # The two literals are concatenated, so the first must end
                # with a space to keep the sentences apart in the help output.
                help="The indexing policy to use for the collection. "
                'example: \'{"deny": ["metadata"]}\'',
            ),
            click.Option(
                ["--batch-size"],
                default=20,
                type=int,
                help="Number of records per batch",
            ),
        ]
        return options
79
+
80
+
81
# Destination command definition for Astra: named after the connector type and
# wired to the connection and uploader option sets defined above. No
# upload_stager_config is passed, so that field keeps its default.
astra_dest_cmd = DestCmd(
    cmd_name=CONNECTOR_TYPE,
    connection_config=AstraCliConnectionConfig,
    uploader_config=AstraCliUploaderConfig,
)