unstructured-ingest 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (356):
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/__init__.py +14 -0
  4. unstructured_ingest/cli/base/__init__.py +0 -0
  5. unstructured_ingest/cli/base/cmd.py +19 -0
  6. unstructured_ingest/cli/base/dest.py +87 -0
  7. unstructured_ingest/cli/base/src.py +57 -0
  8. unstructured_ingest/cli/cli.py +32 -0
  9. unstructured_ingest/cli/cmd_factory.py +12 -0
  10. unstructured_ingest/cli/cmds/__init__.py +145 -0
  11. unstructured_ingest/cli/cmds/airtable.py +69 -0
  12. unstructured_ingest/cli/cmds/astra.py +99 -0
  13. unstructured_ingest/cli/cmds/azure_cognitive_search.py +65 -0
  14. unstructured_ingest/cli/cmds/biomed.py +52 -0
  15. unstructured_ingest/cli/cmds/chroma.py +104 -0
  16. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  17. unstructured_ingest/cli/cmds/confluence.py +69 -0
  18. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  19. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  20. unstructured_ingest/cli/cmds/discord.py +47 -0
  21. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  22. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  23. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  24. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  25. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  26. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  27. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  28. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  29. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  30. unstructured_ingest/cli/cmds/github.py +54 -0
  31. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  32. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  33. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  34. unstructured_ingest/cli/cmds/jira.py +71 -0
  35. unstructured_ingest/cli/cmds/kafka.py +102 -0
  36. unstructured_ingest/cli/cmds/local.py +43 -0
  37. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  38. unstructured_ingest/cli/cmds/notion.py +48 -0
  39. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  40. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  41. unstructured_ingest/cli/cmds/outlook.py +67 -0
  42. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  43. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  44. unstructured_ingest/cli/cmds/reddit.py +67 -0
  45. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  46. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  47. unstructured_ingest/cli/cmds/slack.py +56 -0
  48. unstructured_ingest/cli/cmds/sql.py +66 -0
  49. unstructured_ingest/cli/cmds/vectara.py +66 -0
  50. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  51. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  52. unstructured_ingest/cli/common.py +7 -0
  53. unstructured_ingest/cli/interfaces.py +656 -0
  54. unstructured_ingest/cli/utils.py +205 -0
  55. unstructured_ingest/connector/__init__.py +0 -0
  56. unstructured_ingest/connector/airtable.py +309 -0
  57. unstructured_ingest/connector/astra.py +237 -0
  58. unstructured_ingest/connector/azure_cognitive_search.py +144 -0
  59. unstructured_ingest/connector/biomed.py +313 -0
  60. unstructured_ingest/connector/chroma.py +158 -0
  61. unstructured_ingest/connector/clarifai.py +122 -0
  62. unstructured_ingest/connector/confluence.py +285 -0
  63. unstructured_ingest/connector/databricks_volumes.py +137 -0
  64. unstructured_ingest/connector/delta_table.py +203 -0
  65. unstructured_ingest/connector/discord.py +180 -0
  66. unstructured_ingest/connector/elasticsearch.py +396 -0
  67. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  68. unstructured_ingest/connector/fsspec/azure.py +78 -0
  69. unstructured_ingest/connector/fsspec/box.py +109 -0
  70. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  71. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  72. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  73. unstructured_ingest/connector/fsspec/s3.py +62 -0
  74. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  75. unstructured_ingest/connector/git.py +124 -0
  76. unstructured_ingest/connector/github.py +173 -0
  77. unstructured_ingest/connector/gitlab.py +142 -0
  78. unstructured_ingest/connector/google_drive.py +349 -0
  79. unstructured_ingest/connector/hubspot.py +278 -0
  80. unstructured_ingest/connector/jira.py +469 -0
  81. unstructured_ingest/connector/kafka.py +294 -0
  82. unstructured_ingest/connector/local.py +139 -0
  83. unstructured_ingest/connector/mongodb.py +285 -0
  84. unstructured_ingest/connector/notion/__init__.py +0 -0
  85. unstructured_ingest/connector/notion/client.py +233 -0
  86. unstructured_ingest/connector/notion/connector.py +468 -0
  87. unstructured_ingest/connector/notion/helpers.py +584 -0
  88. unstructured_ingest/connector/notion/interfaces.py +32 -0
  89. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  90. unstructured_ingest/connector/notion/types/block.py +95 -0
  91. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  92. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  93. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  94. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  95. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  96. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  97. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  98. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  99. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  100. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  101. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  102. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  103. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  104. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  105. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  106. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  107. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  108. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  109. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  110. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  111. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  112. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  113. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  114. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  115. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  116. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  117. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  118. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  119. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  120. unstructured_ingest/connector/notion/types/database.py +72 -0
  121. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  122. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  123. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  124. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  125. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  126. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  127. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  128. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  129. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  130. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  131. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  132. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  133. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  134. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  135. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  136. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  137. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  138. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  139. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  140. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  141. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  142. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  143. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  144. unstructured_ingest/connector/notion/types/date.py +26 -0
  145. unstructured_ingest/connector/notion/types/file.py +51 -0
  146. unstructured_ingest/connector/notion/types/page.py +44 -0
  147. unstructured_ingest/connector/notion/types/parent.py +66 -0
  148. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  149. unstructured_ingest/connector/notion/types/user.py +76 -0
  150. unstructured_ingest/connector/onedrive.py +232 -0
  151. unstructured_ingest/connector/opensearch.py +218 -0
  152. unstructured_ingest/connector/outlook.py +285 -0
  153. unstructured_ingest/connector/pinecone.py +140 -0
  154. unstructured_ingest/connector/qdrant.py +144 -0
  155. unstructured_ingest/connector/reddit.py +166 -0
  156. unstructured_ingest/connector/registry.py +109 -0
  157. unstructured_ingest/connector/salesforce.py +301 -0
  158. unstructured_ingest/connector/sharepoint.py +573 -0
  159. unstructured_ingest/connector/slack.py +224 -0
  160. unstructured_ingest/connector/sql.py +199 -0
  161. unstructured_ingest/connector/vectara.py +248 -0
  162. unstructured_ingest/connector/weaviate.py +190 -0
  163. unstructured_ingest/connector/wikipedia.py +208 -0
  164. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  165. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  166. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  167. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  168. unstructured_ingest/error.py +49 -0
  169. unstructured_ingest/evaluate.py +338 -0
  170. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  171. unstructured_ingest/ingest_backoff/_common.py +102 -0
  172. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  173. unstructured_ingest/interfaces.py +838 -0
  174. unstructured_ingest/logger.py +130 -0
  175. unstructured_ingest/main.py +11 -0
  176. unstructured_ingest/pipeline/__init__.py +22 -0
  177. unstructured_ingest/pipeline/copy.py +19 -0
  178. unstructured_ingest/pipeline/doc_factory.py +12 -0
  179. unstructured_ingest/pipeline/interfaces.py +265 -0
  180. unstructured_ingest/pipeline/partition.py +60 -0
  181. unstructured_ingest/pipeline/permissions.py +12 -0
  182. unstructured_ingest/pipeline/pipeline.py +117 -0
  183. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  184. unstructured_ingest/pipeline/reformat/chunking.py +130 -0
  185. unstructured_ingest/pipeline/reformat/embedding.py +66 -0
  186. unstructured_ingest/pipeline/source.py +77 -0
  187. unstructured_ingest/pipeline/utils.py +6 -0
  188. unstructured_ingest/pipeline/write.py +18 -0
  189. unstructured_ingest/processor.py +93 -0
  190. unstructured_ingest/runner/__init__.py +104 -0
  191. unstructured_ingest/runner/airtable.py +35 -0
  192. unstructured_ingest/runner/astra.py +34 -0
  193. unstructured_ingest/runner/base_runner.py +89 -0
  194. unstructured_ingest/runner/biomed.py +45 -0
  195. unstructured_ingest/runner/confluence.py +35 -0
  196. unstructured_ingest/runner/delta_table.py +34 -0
  197. unstructured_ingest/runner/discord.py +35 -0
  198. unstructured_ingest/runner/elasticsearch.py +40 -0
  199. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  200. unstructured_ingest/runner/fsspec/azure.py +30 -0
  201. unstructured_ingest/runner/fsspec/box.py +28 -0
  202. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  203. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  204. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  205. unstructured_ingest/runner/fsspec/s3.py +28 -0
  206. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  207. unstructured_ingest/runner/github.py +37 -0
  208. unstructured_ingest/runner/gitlab.py +37 -0
  209. unstructured_ingest/runner/google_drive.py +35 -0
  210. unstructured_ingest/runner/hubspot.py +35 -0
  211. unstructured_ingest/runner/jira.py +35 -0
  212. unstructured_ingest/runner/kafka.py +34 -0
  213. unstructured_ingest/runner/local.py +23 -0
  214. unstructured_ingest/runner/mongodb.py +34 -0
  215. unstructured_ingest/runner/notion.py +61 -0
  216. unstructured_ingest/runner/onedrive.py +35 -0
  217. unstructured_ingest/runner/opensearch.py +40 -0
  218. unstructured_ingest/runner/outlook.py +33 -0
  219. unstructured_ingest/runner/reddit.py +35 -0
  220. unstructured_ingest/runner/salesforce.py +33 -0
  221. unstructured_ingest/runner/sharepoint.py +35 -0
  222. unstructured_ingest/runner/slack.py +33 -0
  223. unstructured_ingest/runner/utils.py +47 -0
  224. unstructured_ingest/runner/wikipedia.py +35 -0
  225. unstructured_ingest/runner/writers/__init__.py +48 -0
  226. unstructured_ingest/runner/writers/astra.py +22 -0
  227. unstructured_ingest/runner/writers/azure_cognitive_search.py +24 -0
  228. unstructured_ingest/runner/writers/base_writer.py +26 -0
  229. unstructured_ingest/runner/writers/chroma.py +22 -0
  230. unstructured_ingest/runner/writers/clarifai.py +19 -0
  231. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  232. unstructured_ingest/runner/writers/delta_table.py +24 -0
  233. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  234. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  235. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  236. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  237. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  238. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  239. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  240. unstructured_ingest/runner/writers/kafka.py +21 -0
  241. unstructured_ingest/runner/writers/mongodb.py +21 -0
  242. unstructured_ingest/runner/writers/opensearch.py +26 -0
  243. unstructured_ingest/runner/writers/pinecone.py +21 -0
  244. unstructured_ingest/runner/writers/qdrant.py +19 -0
  245. unstructured_ingest/runner/writers/sql.py +22 -0
  246. unstructured_ingest/runner/writers/vectara.py +22 -0
  247. unstructured_ingest/runner/writers/weaviate.py +21 -0
  248. unstructured_ingest/utils/__init__.py +0 -0
  249. unstructured_ingest/utils/compression.py +117 -0
  250. unstructured_ingest/utils/data_prep.py +112 -0
  251. unstructured_ingest/utils/dep_check.py +66 -0
  252. unstructured_ingest/utils/string_and_date_utils.py +39 -0
  253. unstructured_ingest/utils/table.py +73 -0
  254. unstructured_ingest/v2/__init__.py +1 -0
  255. unstructured_ingest/v2/cli/__init__.py +0 -0
  256. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  257. unstructured_ingest/v2/cli/base/cmd.py +215 -0
  258. unstructured_ingest/v2/cli/base/dest.py +76 -0
  259. unstructured_ingest/v2/cli/base/importer.py +34 -0
  260. unstructured_ingest/v2/cli/base/src.py +70 -0
  261. unstructured_ingest/v2/cli/cli.py +24 -0
  262. unstructured_ingest/v2/cli/cmds/__init__.py +87 -0
  263. unstructured_ingest/v2/cli/cmds/astra.py +85 -0
  264. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +72 -0
  265. unstructured_ingest/v2/cli/cmds/chroma.py +108 -0
  266. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +161 -0
  267. unstructured_ingest/v2/cli/cmds/elasticsearch.py +159 -0
  268. unstructured_ingest/v2/cli/cmds/fsspec/__init__.py +0 -0
  269. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +84 -0
  270. unstructured_ingest/v2/cli/cmds/fsspec/box.py +58 -0
  271. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +58 -0
  272. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +77 -0
  273. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +81 -0
  274. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +84 -0
  275. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +80 -0
  276. unstructured_ingest/v2/cli/cmds/google_drive.py +74 -0
  277. unstructured_ingest/v2/cli/cmds/local.py +60 -0
  278. unstructured_ingest/v2/cli/cmds/mongodb.py +62 -0
  279. unstructured_ingest/v2/cli/cmds/onedrive.py +91 -0
  280. unstructured_ingest/v2/cli/cmds/opensearch.py +93 -0
  281. unstructured_ingest/v2/cli/cmds/pinecone.py +62 -0
  282. unstructured_ingest/v2/cli/cmds/salesforce.py +79 -0
  283. unstructured_ingest/v2/cli/cmds/sharepoint.py +112 -0
  284. unstructured_ingest/v2/cli/cmds/singlestore.py +96 -0
  285. unstructured_ingest/v2/cli/cmds/sql.py +84 -0
  286. unstructured_ingest/v2/cli/cmds/weaviate.py +100 -0
  287. unstructured_ingest/v2/cli/configs/__init__.py +6 -0
  288. unstructured_ingest/v2/cli/configs/chunk.py +89 -0
  289. unstructured_ingest/v2/cli/configs/embed.py +74 -0
  290. unstructured_ingest/v2/cli/configs/partition.py +99 -0
  291. unstructured_ingest/v2/cli/configs/processor.py +88 -0
  292. unstructured_ingest/v2/cli/interfaces.py +27 -0
  293. unstructured_ingest/v2/cli/utils.py +240 -0
  294. unstructured_ingest/v2/example.py +37 -0
  295. unstructured_ingest/v2/interfaces/__init__.py +29 -0
  296. unstructured_ingest/v2/interfaces/connector.py +32 -0
  297. unstructured_ingest/v2/interfaces/downloader.py +79 -0
  298. unstructured_ingest/v2/interfaces/file_data.py +49 -0
  299. unstructured_ingest/v2/interfaces/indexer.py +28 -0
  300. unstructured_ingest/v2/interfaces/process.py +20 -0
  301. unstructured_ingest/v2/interfaces/processor.py +48 -0
  302. unstructured_ingest/v2/interfaces/upload_stager.py +48 -0
  303. unstructured_ingest/v2/interfaces/uploader.py +39 -0
  304. unstructured_ingest/v2/logger.py +126 -0
  305. unstructured_ingest/v2/main.py +11 -0
  306. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  307. unstructured_ingest/v2/pipeline/interfaces.py +167 -0
  308. unstructured_ingest/v2/pipeline/pipeline.py +284 -0
  309. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  310. unstructured_ingest/v2/pipeline/steps/chunk.py +85 -0
  311. unstructured_ingest/v2/pipeline/steps/download.py +124 -0
  312. unstructured_ingest/v2/pipeline/steps/embed.py +84 -0
  313. unstructured_ingest/v2/pipeline/steps/index.py +61 -0
  314. unstructured_ingest/v2/pipeline/steps/partition.py +78 -0
  315. unstructured_ingest/v2/pipeline/steps/stage.py +64 -0
  316. unstructured_ingest/v2/pipeline/steps/uncompress.py +68 -0
  317. unstructured_ingest/v2/pipeline/steps/upload.py +73 -0
  318. unstructured_ingest/v2/pipeline/utils.py +15 -0
  319. unstructured_ingest/v2/processes/__init__.py +0 -0
  320. unstructured_ingest/v2/processes/chunker.py +97 -0
  321. unstructured_ingest/v2/processes/connector_registry.py +63 -0
  322. unstructured_ingest/v2/processes/connectors/__init__.py +77 -0
  323. unstructured_ingest/v2/processes/connectors/astra.py +152 -0
  324. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +211 -0
  325. unstructured_ingest/v2/processes/connectors/chroma.py +204 -0
  326. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +96 -0
  327. unstructured_ingest/v2/processes/connectors/elasticsearch.py +401 -0
  328. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  329. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +144 -0
  330. unstructured_ingest/v2/processes/connectors/fsspec/box.py +131 -0
  331. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +130 -0
  332. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +342 -0
  333. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +141 -0
  334. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +164 -0
  335. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +166 -0
  336. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  337. unstructured_ingest/v2/processes/connectors/google_drive.py +335 -0
  338. unstructured_ingest/v2/processes/connectors/local.py +204 -0
  339. unstructured_ingest/v2/processes/connectors/mongodb.py +138 -0
  340. unstructured_ingest/v2/processes/connectors/onedrive.py +216 -0
  341. unstructured_ingest/v2/processes/connectors/opensearch.py +155 -0
  342. unstructured_ingest/v2/processes/connectors/pinecone.py +178 -0
  343. unstructured_ingest/v2/processes/connectors/salesforce.py +293 -0
  344. unstructured_ingest/v2/processes/connectors/sharepoint.py +412 -0
  345. unstructured_ingest/v2/processes/connectors/singlestore.py +160 -0
  346. unstructured_ingest/v2/processes/connectors/sql.py +269 -0
  347. unstructured_ingest/v2/processes/connectors/utils.py +19 -0
  348. unstructured_ingest/v2/processes/connectors/weaviate.py +235 -0
  349. unstructured_ingest/v2/processes/embedder.py +76 -0
  350. unstructured_ingest/v2/processes/partitioner.py +166 -0
  351. unstructured_ingest/v2/processes/uncompress.py +43 -0
  352. unstructured_ingest-0.0.0.dist-info/METADATA +319 -0
  353. unstructured_ingest-0.0.0.dist-info/RECORD +356 -0
  354. unstructured_ingest-0.0.0.dist-info/WHEEL +5 -0
  355. unstructured_ingest-0.0.0.dist-info/entry_points.txt +2 -0
  356. unstructured_ingest-0.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
1
+ from __future__ import annotations
@@ -0,0 +1 @@
1
+ __version__ = "0.0.0" # pragma: no cover
@@ -0,0 +1,14 @@
1
+ import typing as t
2
+
3
+ import click
4
+
5
+ from unstructured_ingest.cli.cmds import base_dest_cmd_fns, base_src_cmd_fns
6
+
7
# One click group per source command factory registered in
# unstructured_ingest.cli.cmds.
src: t.List[click.Group] = [make_cmd().get_src_cmd() for make_cmd in base_src_cmd_fns]

# One click command per destination command factory registered there.
dest: t.List[click.Command] = [make_cmd().get_dest_cmd() for make_cmd in base_dest_cmd_fns]

__all__ = ["src", "dest"]
File without changes
@@ -0,0 +1,19 @@
1
+ import typing as t
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+
5
+ from unstructured_ingest.cli.interfaces import CliConfig
6
+ from unstructured_ingest.interfaces import BaseConfig
7
+
8
+
9
@dataclass
class BaseCmd(ABC):
    """Shared settings for a CLI command.

    Holds the command name together with the config classes and extra CLI
    option classes associated with the command.
    """

    # Command name as supplied by the caller; ``cmd_name_key`` derives from it.
    cmd_name: str
    # Optional config class associated with this command.
    cli_config: t.Optional[t.Type[BaseConfig]] = None
    # Extra CLI option classes in addition to ``cli_config``.
    additional_cli_options: t.List[t.Type[CliConfig]] = field(default_factory=list)
    # Additional config classes, keyed by name.
    addition_configs: t.Dict[str, t.Type[BaseConfig]] = field(default_factory=dict)
    # Flag marking the command as fsspec-based.
    is_fsspec: bool = False

    @property
    def cmd_name_key(self) -> str:
        """Return ``cmd_name`` with every hyphen replaced by an underscore."""
        return "_".join(self.cmd_name.split("-"))
@@ -0,0 +1,87 @@
1
+ import logging
2
+ import typing as t
3
+ from dataclasses import dataclass
4
+
5
+ import click
6
+
7
+ from unstructured_ingest.cli.base.cmd import BaseCmd
8
+ from unstructured_ingest.cli.cmd_factory import get_src_cmd
9
+ from unstructured_ingest.cli.common import (
10
+ log_options,
11
+ )
12
+ from unstructured_ingest.cli.interfaces import BaseConfig, CliFilesStorageConfig
13
+ from unstructured_ingest.cli.utils import (
14
+ add_options,
15
+ conform_click_options,
16
+ extract_config,
17
+ extract_configs,
18
+ )
19
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
20
+ from unstructured_ingest.runner.writers import writer_map
21
+
22
+
23
@dataclass
class BaseDestCmd(BaseCmd):
    """Base class for destination commands that run beneath a source command."""

    # Optional config class handed to the writer as ``write_config``.
    write_config: t.Optional[t.Type[BaseConfig]] = None

    def get_dest_runner(self, source_cmd: str, options: dict, parent_options: dict):
        """Build the parent source runner and attach this destination's writer.

        Args:
            source_cmd: Key of the parent source command, used to look up its
                command factory through ``get_src_cmd``.
            options: Options supplied to this destination command.
            parent_options: Options supplied to the parent source command.

        Returns:
            The source runner with ``writer`` and ``writer_kwargs`` set.

        Raises:
            KeyError: If ``cmd_name_key`` has no entry in ``writer_map``.
        """
        src_cmd = get_src_cmd(cmd_name=source_cmd)()
        runner = src_cmd.get_source_runner(options=parent_options)

        # Work on a copy: building a runner must not alter the
        # ``addition_configs`` mapping stored on this command instance.
        addition_configs = dict(self.addition_configs)
        addition_configs.setdefault("connector_config", self.cli_config)
        if self.write_config:
            addition_configs["write_config"] = self.write_config

        configs = extract_configs(
            options,
            validate=[self.cli_config] if self.cli_config else None,
            extras=addition_configs,
            add_defaults=False,
        )
        writer_cls = writer_map[self.cmd_name_key]
        runner.writer = writer_cls(**configs)  # type: ignore
        runner.writer_kwargs = options
        return runner

    def check_dest_options(self, options: dict):
        """Run ``extract_config`` over ``options`` using ``cli_config``.

        The extracted config is discarded; any exception raised by
        ``extract_config`` propagates to the caller.
        """
        # NOTE(review): ``cli_config`` may be None here — confirm that
        # ``extract_config`` accepts that.
        extract_config(flat_data=options, config=self.cli_config)

    def dest(self, ctx: click.Context, **options):
        # Click callback for the destination command.
        # NOTE: deliberately documented with comments rather than a docstring,
        # because click uses a callback's docstring as the command help text.
        if not ctx.parent:
            raise click.ClickException("destination command called without a parent")
        if not ctx.parent.info_name:
            raise click.ClickException("parent command missing info name")
        source_cmd = ctx.parent.info_name.replace("-", "_")
        # ``ctx.parent`` is known to be set at this point, so no fallback is needed.
        parent_options: dict = ctx.parent.params
        conform_click_options(options)
        verbose = parent_options.get("verbose", False)
        ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
        log_options(parent_options, verbose=verbose)
        log_options(options, verbose=verbose)
        try:
            self.check_dest_options(options=options)
            runner = self.get_dest_runner(
                source_cmd=source_cmd,
                options=options,
                parent_options=parent_options,
            )
            runner.run(**parent_options)
        except Exception as e:
            # Log the full traceback, then surface the failure as a click error.
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_dest_cmd(self) -> click.Command:
        """Create the click command for this destination without decorators.

        Returns:
            A command named ``cmd_name`` whose callback is ``dest`` and whose
            options come from ``cli_config``, ``additional_cli_options`` and,
            for fsspec commands, ``CliFilesStorageConfig``.
        """
        fn = click.pass_context(self.dest)
        cmd: click.Command = click.command(fn)
        cmd.name = self.cmd_name
        # NOTE(review): ``invoke_without_command`` is a Group setting; it is kept
        # from the original — confirm it is needed on a plain command.
        cmd.invoke_without_command = True  # type: ignore[attr-defined]
        # Built as a fresh list so ``additional_cli_options`` is left untouched.
        options: list = [self.cli_config] if self.cli_config else []
        options.extend(self.additional_cli_options)
        if self.is_fsspec and CliFilesStorageConfig not in options:
            options.append(CliFilesStorageConfig)
        add_options(cmd, extras=options, is_src=False)
        return cmd
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.cmd import BaseCmd
7
+ from unstructured_ingest.cli.common import (
8
+ log_options,
9
+ )
10
+ from unstructured_ingest.cli.interfaces import CliFilesStorageConfig
11
+ from unstructured_ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
12
+ from unstructured_ingest.logger import ingest_log_streaming_init, logger
13
+ from unstructured_ingest.runner import runner_map
14
+
15
+
16
@dataclass
class BaseSrcCmd(BaseCmd):
    """Base class for source commands exposed as click groups."""

    def get_source_runner(self, options: dict):
        """Instantiate the runner registered for this source command.

        Args:
            options: Options supplied to the source command.

        Returns:
            The ``runner_map`` entry for ``cmd_name_key``, constructed from the
            configs that ``extract_configs`` produces.

        Raises:
            KeyError: If ``cmd_name_key`` has no entry in ``runner_map``.
        """
        # Work on a copy: building a runner must not alter the
        # ``addition_configs`` mapping stored on this command instance.
        addition_configs = dict(self.addition_configs)
        addition_configs.setdefault("connector_config", self.cli_config)
        configs = extract_configs(
            options,
            validate=[self.cli_config] if self.cli_config else None,
            extras=addition_configs,
        )
        runner_cls = runner_map[self.cmd_name_key]
        return runner_cls(**configs)  # type: ignore

    def src(self, ctx: click.Context, **options):
        # Click callback for the source group.
        # NOTE: deliberately documented with comments rather than a docstring,
        # because click uses a callback's docstring as the command help text.
        if ctx.invoked_subcommand:
            # A subcommand was invoked; leave the work to it.
            return

        conform_click_options(options)
        verbose = options.get("verbose", False)
        ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
        log_options(options, verbose=verbose)
        try:
            runner = self.get_source_runner(options=options)
            runner.run(**options)
        except Exception as e:
            # Log the full traceback, then surface the failure as a click error.
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_src_cmd(self) -> click.Group:
        """Create the click group for this source without decorators.

        Returns:
            A group named ``cmd_name`` whose callback is ``src`` and whose
            options come from ``cli_config``, ``additional_cli_options`` and,
            for fsspec commands, ``CliFilesStorageConfig``.
        """
        fn = click.pass_context(self.src)
        cmd: click.Group = click.group(fn, cls=Group)
        cmd.name = self.cmd_name
        # Let the group callback run even when no subcommand is given.
        cmd.invoke_without_command = True
        # Built as a fresh list so ``additional_cli_options`` is left untouched.
        extra_options: list = [self.cli_config] if self.cli_config else []
        extra_options.extend(self.additional_cli_options)
        if self.is_fsspec and CliFilesStorageConfig not in extra_options:
            extra_options.append(CliFilesStorageConfig)
        add_options(cmd, extras=extra_options)
        return cmd
@@ -0,0 +1,32 @@
1
+ import click
2
+
3
+ from unstructured_ingest.cli import dest, src
4
+ from unstructured_ingest.v2.cli.cmds import dest as dest_v2
5
+ from unstructured_ingest.v2.cli.cmds import src as src_v2
6
+
7
+
8
@click.group()
def ingest():
    # Root click group with no logic of its own.
    # NOTE: no docstring on purpose — click would show it as the help text.
    return None
11
+
12
+
13
def get_cmd() -> click.Command:
    """Assemble and return the top-level CLI command.

    Source and destination commands are collected by name, with the v2
    commands replacing same-named ones from ``unstructured_ingest.cli``. Every
    destination command is registered under every source command, and every
    source command is registered under the ``ingest`` group.
    """
    # Later entries win on a name clash, so the v2 commands take precedence.
    sources = {command.name: command for command in [*src, *src_v2]}
    destinations = {command.name: command for command in [*dest, *dest_v2]}

    for source in sources.values():
        for destination in destinations.values():
            source.add_command(destination)
        ingest.add_command(source)
    return ingest
@@ -0,0 +1,12 @@
1
+ import typing as t
2
+
3
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
4
+ from unstructured_ingest.cli.cmds import base_src_cmd_fns
5
+
6
+
7
def get_src_cmd_map() -> t.Dict[str, t.Callable[[], BaseSrcCmd]]:
    """Map each source command's ``cmd_name_key`` to the factory that builds it.

    Every factory in ``base_src_cmd_fns`` is called once to read the key of the
    command it produces; the map stores the factory itself, not that instance.
    """
    cmd_map: t.Dict[str, t.Callable[[], BaseSrcCmd]] = {}
    for factory in base_src_cmd_fns:
        cmd_map[factory().cmd_name_key] = factory
    return cmd_map
9
+
10
+
11
def get_src_cmd(cmd_name: str) -> t.Callable[[], BaseSrcCmd]:
    """Look up the source command factory registered under ``cmd_name``.

    Raises:
        KeyError: If no source command has that key.
    """
    registry = get_src_cmd_map()
    return registry[cmd_name]
@@ -0,0 +1,145 @@
1
+ from __future__ import annotations
2
+
3
+ import collections
4
+ import typing as t
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
8
+
9
+ from .airtable import get_base_src_cmd as airtable_base_src_cmd
10
+ from .astra import get_base_dest_cmd as astra_base_dest_cmd
11
+ from .astra import get_base_src_cmd as astra_base_src_cmd
12
+ from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
13
+ from .biomed import get_base_src_cmd as biomed_base_src_cmd
14
+ from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
15
+ from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd
16
+ from .confluence import get_base_src_cmd as confluence_base_src_cmd
17
+ from .databricks_volumes import get_base_dest_cmd as databricks_volumes_dest_cmd
18
+ from .delta_table import get_base_dest_cmd as delta_table_dest_cmd
19
+ from .delta_table import get_base_src_cmd as delta_table_base_src_cmd
20
+ from .discord import get_base_src_cmd as discord_base_src_cmd
21
+ from .elasticsearch import get_base_dest_cmd as elasticsearch_base_dest_cmd
22
+ from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd
23
+ from .fsspec.azure import get_base_dest_cmd as azure_base_dest_cmd
24
+ from .fsspec.azure import get_base_src_cmd as azure_base_src_cmd
25
+ from .fsspec.box import get_base_dest_cmd as box_base_dest_cmd
26
+ from .fsspec.box import get_base_src_cmd as box_base_src_cmd
27
+ from .fsspec.dropbox import get_base_dest_cmd as dropbox_base_dest_cmd
28
+ from .fsspec.dropbox import get_base_src_cmd as dropbox_base_src_cmd
29
+ from .fsspec.fsspec import get_base_dest_cmd as fsspec_base_dest_cmd
30
+ from .fsspec.fsspec import get_base_src_cmd as fsspec_base_src_cmd
31
+ from .fsspec.gcs import get_base_dest_cmd as gcs_base_dest_cmd
32
+ from .fsspec.gcs import get_base_src_cmd as gcs_base_src_cmd
33
+ from .fsspec.s3 import get_base_dest_cmd as s3_base_dest_cmd
34
+ from .fsspec.s3 import get_base_src_cmd as s3_base_src_cmd
35
+ from .github import get_base_src_cmd as github_base_src_cmd
36
+ from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
37
+ from .google_drive import get_base_src_cmd as google_drive_base_src_cmd
38
+ from .hubspot import get_base_src_cmd as hubspot_base_src_cmd
39
+ from .jira import get_base_src_cmd as jira_base_src_cmd
40
+ from .kafka import get_base_dest_cmd as kafka_base_dest_cmd
41
+ from .kafka import get_base_src_cmd as kafka_base_src_cmd
42
+ from .local import get_base_src_cmd as local_base_src_cmd
43
+ from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
44
+ from .mongodb import get_base_src_cmd as mongodb_base_src_cmd
45
+ from .notion import get_base_src_cmd as notion_base_src_cmd
46
+ from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
47
+ from .opensearch import get_base_dest_cmd as opensearch_base_dest_cmd
48
+ from .opensearch import get_base_src_cmd as opensearch_base_src_cmd
49
+ from .outlook import get_base_src_cmd as outlook_base_src_cmd
50
+ from .pinecone import get_base_dest_cmd as pinecone_base_dest_cmd
51
+ from .qdrant import get_base_dest_cmd as qdrant_base_dest_cmd
52
+ from .reddit import get_base_src_cmd as reddit_base_src_cmd
53
+ from .salesforce import get_base_src_cmd as salesforce_base_src_cmd
54
+ from .sharepoint import get_base_src_cmd as sharepoint_base_src_cmd
55
+ from .slack import get_base_src_cmd as slack_base_src_cmd
56
+ from .sql import get_base_dest_cmd as sql_base_dest_cmd
57
+ from .vectara import get_base_dest_cmd as vectara_base_dest_cmd
58
+ from .weaviate import get_base_dest_cmd as weaviate_dest_cmd
59
+ from .wikipedia import get_base_src_cmd as wikipedia_base_src_cmd
60
+
61
+ if t.TYPE_CHECKING:
62
+ from unstructured_ingest.cli.base.dest import BaseDestCmd
63
+
64
# Factories for every source connector command exposed by the CLI.
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
    airtable_base_src_cmd,
    astra_base_src_cmd,
    azure_base_src_cmd,
    biomed_base_src_cmd,
    box_base_src_cmd,
    confluence_base_src_cmd,
    delta_table_base_src_cmd,
    discord_base_src_cmd,
    dropbox_base_src_cmd,
    elasticsearch_base_src_cmd,
    fsspec_base_src_cmd,
    gcs_base_src_cmd,
    github_base_src_cmd,
    gitlab_base_src_cmd,
    google_drive_base_src_cmd,
    hubspot_base_src_cmd,
    jira_base_src_cmd,
    kafka_base_src_cmd,
    local_base_src_cmd,
    mongodb_base_src_cmd,
    notion_base_src_cmd,
    onedrive_base_src_cmd,
    opensearch_base_src_cmd,
    outlook_base_src_cmd,
    reddit_base_src_cmd,
    salesforce_base_src_cmd,
    sftp_base_src_cmd,
    sharepoint_base_src_cmd,
    slack_base_src_cmd,
    s3_base_src_cmd,
    wikipedia_base_src_cmd,
]

# Import-time guard: each factory is invoked once to read its command name,
# and the import fails if two source commands share a name.
src_cmd_names = [make_cmd().cmd_name for make_cmd in base_src_cmd_fns]
src_duplicates = [
    name
    for name, occurrences in collections.Counter(src_cmd_names).items()
    if occurrences > 1
]
if src_duplicates:
    raise ValueError(
        "multiple base src commands defined with the same names: {}".format(
            ", ".join(src_duplicates)
        )
    )
107
+
108
# Factories for every destination connector command exposed by the CLI.
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
    astra_base_dest_cmd,
    azure_base_dest_cmd,
    box_base_dest_cmd,
    chroma_base_dest_cmd,
    clarifai_base_dest_cmd,
    databricks_volumes_dest_cmd,
    dropbox_base_dest_cmd,
    elasticsearch_base_dest_cmd,
    fsspec_base_dest_cmd,
    gcs_base_dest_cmd,
    kafka_base_dest_cmd,
    s3_base_dest_cmd,
    azure_cognitive_search_base_dest_cmd,
    delta_table_dest_cmd,
    sql_base_dest_cmd,
    weaviate_dest_cmd,
    mongo_base_dest_cmd,
    pinecone_base_dest_cmd,
    qdrant_base_dest_cmd,
    opensearch_base_dest_cmd,
    vectara_base_dest_cmd,
]

# Import-time guard: each factory is invoked once to read its command name,
# and the import fails if two destination commands share a name.
dest_cmd_names = [make_cmd().cmd_name for make_cmd in base_dest_cmd_fns]
dest_duplicates = [
    name
    for name, occurrences in collections.Counter(dest_cmd_names).items()
    if occurrences > 1
]
if dest_duplicates:
    raise ValueError(
        "multiple base dest commands defined with the same names: {}".format(
            ", ".join(dest_duplicates)
        )
    )

# Public names exported by this package module.
__all__ = [
    "base_src_cmd_fns",
    "base_dest_cmd_fns",
]
@@ -0,0 +1,69 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.airtable import SimpleAirtableConfig
11
+
12
+
13
# CLI-facing Airtable config: combines SimpleAirtableConfig with CliConfig and
# supplies the click options used by the "airtable" source command.
@dataclass
class AirtableCliConfig(SimpleAirtableConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options specific to the Airtable connector.

        Both options default to ``None``, so neither has to be passed on the
        command line.
        """
        # NOTE(review): the diff rendering this was read from strips leading
        # whitespace, so any indentation inside the multi-line help literal
        # below could not be recovered -- confirm against the packaged file.
        options = [
            click.Option(
                ["--personal-access-token"],
                default=None,
                help="Personal access token to authenticate into Airtable. Check: "
                "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
                "for more info",
            ),
            click.Option(
                ["--list-of-paths"],
                default=None,
                help="""
A list of paths that specify the locations to ingest data from within Airtable.

If this argument is not set, the connector ingests all tables within each and every base.
--list-of-paths: path1 path2 path3 ….
path: base_id/table_id(optional)/view_id(optional)/

To obtain (base, table, view) ids in bulk, check:
https://airtable.com/developers/web/api/list-bases (base ids)
https://airtable.com/developers/web/api/get-base-schema (table and view ids)
https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)

To obtain specific ids from Airtable UI, go to your workspace, and copy any
relevant id from the URL structure:
https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
appAbcDeF1ghijKlm -> base_id
tblABcdEfG1HIJkLm -> table_id
viwABCDEfg6hijKLM -> view_id

You can also check: https://support.airtable.com/docs/finding-airtable-ids

Here is an example for one --list-of-paths:
base1/ → gets the entirety of all tables inside base1
base1/table1 → gets all rows and columns within table1 in base1
base1/table1/view1 → gets the rows and columns that are
visible in view1 for the table1 in base1

Examples to invalid airtable_paths:
table1 → has to mention base to be valid
base1/view1 → has to mention table to be valid
""",
            ),
        ]
        return options
62
+
63
+
64
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the source-command descriptor for the Airtable connector."""
    return BaseSrcCmd(cmd_name="airtable", cli_config=AirtableCliConfig)
@@ -0,0 +1,99 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import CliConfig, Dict
7
+ from unstructured_ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
8
+
9
+
10
# CLI-facing Astra DB connection config: combines SimpleAstraConfig with
# CliConfig and supplies the connection-level click options.
@dataclass
class AstraCliConfig(SimpleAstraConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options describing an Astra DB connection.

        ``--token`` and ``--api-endpoint`` are required and may also be read
        from environment variables; ``--collection-name`` and ``--namespace``
        are optional.
        """
        token_opt = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Astra DB Token with access to the database.",
            envvar="ASTRA_DB_APPLICATION_TOKEN",
            show_envvar=True,
        )
        endpoint_opt = click.Option(
            ["--api-endpoint"],
            required=True,
            type=str,
            help="The API endpoint for the Astra DB.",
            envvar="ASTRA_DB_API_ENDPOINT",
            show_envvar=True,
        )
        collection_opt = click.Option(
            ["--collection-name"],
            required=False,
            type=str,
            help="The name of the Astra DB collection. "
            "Note that the collection name must only include letters, "
            "numbers, and underscores.",
        )
        namespace_opt = click.Option(
            ["--namespace"],
            required=False,
            default=None,
            type=str,
            help="The Astra DB connection namespace.",
        )
        return [token_opt, endpoint_opt, collection_opt, namespace_opt]
48
+
49
+
50
# CLI-facing Astra DB write config: combines AstraWriteConfig with CliConfig
# and supplies the write-side click options.
@dataclass
class AstraCliWriteConfig(AstraWriteConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options that control writes to Astra DB.

        Covers the embedding dimensionality (default 384), an optional
        indexing policy parsed by the ``Dict`` param type, and the upload
        batch size (default 20).
        """
        options = [
            click.Option(
                ["--embedding-dimension"],
                # NOTE(review): a default is supplied alongside required=True,
                # so the "required" check presumably never fires -- confirm intent.
                required=True,
                default=384,
                type=int,
                help="The dimensionality of the embeddings",
            ),
            click.Option(
                ["--requested-indexing-policy"],
                required=False,
                default=None,
                type=Dict(),
                # The first fragment ends with a space so the two adjacent
                # literals do not run together as "collection.example:".
                help="The indexing policy to use for the collection. "
                'example: \'{"deny": ["metadata"]}\' ',
            ),
            click.Option(
                ["--batch-size"],
                default=20,
                type=int,
                help="Number of records per batch",
            ),
        ]
        return options
78
+
79
+
80
def get_base_src_cmd():
    """Create the source-command descriptor for the Astra DB connector."""
    # Imported inside the function, as in the original, rather than at module level.
    from unstructured_ingest.cli.base.src import BaseSrcCmd

    return BaseSrcCmd(cmd_name="astra", cli_config=AstraCliConfig)
88
+
89
+
90
def get_base_dest_cmd():
    """Create the destination-command descriptor for the Astra DB connector.

    Connection options come from ``AstraCliConfig``; write options are added
    through ``AstraCliWriteConfig`` and bound to ``AstraWriteConfig``.
    """
    # Imported inside the function, as in the original, rather than at module level.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    dest_kwargs = dict(
        cmd_name="astra",
        cli_config=AstraCliConfig,
        additional_cli_options=[AstraCliWriteConfig],
        write_config=AstraWriteConfig,
    )
    return BaseDestCmd(**dest_kwargs)
@@ -0,0 +1,65 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.interfaces import (
7
+ CliConfig,
8
+ )
9
+ from unstructured_ingest.connector.azure_cognitive_search import (
10
+ AzureCognitiveSearchWriteConfig,
11
+ SimpleAzureCognitiveSearchStorageConfig,
12
+ )
13
+
14
+
15
# CLI-facing Azure Cognitive Search connection config: combines
# SimpleAzureCognitiveSearchStorageConfig with CliConfig and supplies the
# connection-level click options.
@dataclass
class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options describing an Azure search connection.

        Both ``--key`` and ``--endpoint`` are required and may also be read
        from the ``AZURE_SEARCH_API_KEY`` / ``AZURE_SEARCH_ENDPOINT``
        environment variables.
        """
        options = [
            click.Option(
                ["--key"],
                required=True,
                type=str,
                help="Key credential used for authenticating to an Azure service.",
                envvar="AZURE_SEARCH_API_KEY",
                show_envvar=True,
            ),
            click.Option(
                ["--endpoint"],
                required=True,
                type=str,
                # This is a plain literal (not an f-string or .format template),
                # so the placeholder uses single braces; doubled braces would be
                # shown to the user verbatim.
                help="The URL endpoint of an Azure search service. "
                "In the form of https://{service_name}.search.windows.net",
                envvar="AZURE_SEARCH_ENDPOINT",
                show_envvar=True,
            ),
        ]
        return options
39
+
40
+
41
# CLI-facing Azure Cognitive Search write config: combines
# AzureCognitiveSearchWriteConfig with CliConfig and supplies the write-side
# click options.
@dataclass
class AzureCognitiveSearchCliWriteConfig(AzureCognitiveSearchWriteConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the single required ``--index`` option naming the target index."""
        index_opt = click.Option(
            ["--index"],
            required=True,
            type=str,
            help="The name of the index to connect to",
        )
        return [index_opt]
54
+
55
+
56
def get_base_dest_cmd():
    """Create the destination-command descriptor for Azure Cognitive Search.

    Connection options come from ``AzureCognitiveSearchCliConfig``; write
    options are added through ``AzureCognitiveSearchCliWriteConfig``.
    """
    # Imported inside the function, as in the original, rather than at module level.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    # NOTE(review): write_config is the CLI subclass here rather than the plain
    # AzureCognitiveSearchWriteConfig -- confirm this is intentional.
    dest_kwargs = dict(
        cmd_name="azure-cognitive-search",
        cli_config=AzureCognitiveSearchCliConfig,
        additional_cli_options=[AzureCognitiveSearchCliWriteConfig],
        write_config=AzureCognitiveSearchCliWriteConfig,
    )
    return BaseDestCmd(**dest_kwargs)
@@ -0,0 +1,52 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.biomed import SimpleBiomedConfig
11
+
12
+
13
# CLI-facing Biomed config: combines SimpleBiomedConfig with CliConfig and
# supplies the click options used by the "biomed" source command.
@dataclass
class BiomedCliConfig(SimpleBiomedConfig, CliConfig):
    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options specific to the Biomed connector.

        All options are optional; ``--max-request-time`` defaults to 45 and
        the others default to ``None``.
        """
        api_id_opt = click.Option(
            ["--api-id"],
            default=None,
            help="ID parameter for OA Web Service API.",
        )
        api_from_opt = click.Option(
            ["--api-from"],
            default=None,
            help="From parameter for OA Web Service API.",
        )
        api_until_opt = click.Option(
            ["--api-until"],
            default=None,
            help="Until parameter for OA Web Service API.",
        )
        path_opt = click.Option(
            ["--path"],
            default=None,
            help="PMC Open Access FTP Directory Path.",
        )
        max_request_time_opt = click.Option(
            ["--max-request-time"],
            default=45,
            help="(In seconds) Max request time to OA Web Service API.",
        )
        return [api_id_opt, api_from_opt, api_until_opt, path_opt, max_request_time_opt]
45
+
46
+
47
def get_base_src_cmd() -> BaseSrcCmd:
    """Create the source-command descriptor for the Biomed connector."""
    return BaseSrcCmd(cmd_name="biomed", cli_config=BiomedCliConfig)