unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,51 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.fsspec.dropbox import (
11
+ DropboxWriteConfig,
12
+ SimpleDropboxConfig,
13
+ )
14
+
15
+ CMD_NAME = "dropbox"
16
+
17
+
18
@dataclass
class DropboxCliConfig(SimpleDropboxConfig, CliConfig):
    """Dropbox connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the Dropbox command: a required ``--token`` string."""
        token_option = click.Option(
            ["--token"],
            required=True,
            type=str,
            help="Dropbox access token.",
        )
        return [token_option]
31
+
32
+
33
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the source command for Dropbox, wired to ``DropboxCliConfig`` and marked as fsspec."""
    return BaseSrcCmd(cmd_name=CMD_NAME, cli_config=DropboxCliConfig, is_fsspec=True)
40
+
41
+
42
def get_base_dest_cmd():
    """Build the destination command for Dropbox.

    The command is wired to ``DropboxCliConfig`` and ``DropboxWriteConfig`` and marked as fsspec.
    """
    # Imported at call time rather than at module import time.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=DropboxCliConfig,
        write_config=DropboxWriteConfig,
        is_fsspec=True,
    )
@@ -0,0 +1,15 @@
1
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
2
+
3
+ CMD_NAME = "fsspec"
4
+
5
+
6
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the generic fsspec source command; no connector-specific CLI config is passed."""
    return BaseSrcCmd(cmd_name=CMD_NAME, is_fsspec=True)
9
+
10
+
11
def get_base_dest_cmd():
    """Build the generic fsspec destination command; no connector-specific config is passed."""
    # Imported at call time rather than at module import time.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(cmd_name=CMD_NAME, is_fsspec=True)
@@ -0,0 +1,71 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ FileOrJson,
10
+ )
11
+ from unstructured_ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig
12
+
13
+ CMD_NAME = "gcs"
14
+
15
+
16
@dataclass
class GcsCliConfig(SimpleGcsConfig, CliConfig):
    """GCS connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the GCS command.

        The single option, ``--service-account-key``, defaults to ``None`` and is parsed
        with ``FileOrJson(allow_raw_str=True)``.
        """
        # Every bullet starts with the accepted value so the list reads uniformly in --help.
        help_string = """
        Options:
        - ``None``, GCSFS will attempt to guess your credentials in the
        following order: gcloud CLI default, gcsfs cached token, google compute
        metadata service, anonymous.
        - ``'google_default'``, your default gcloud credentials will be used,
        which are typically established by doing ``gcloud login`` in a terminal.
        - ``'cache'``, credentials from previously successful gcsfs
        authentication will be used (use this after "browser" auth succeeded)
        - ``'anon'``, no authentication is performed, and you can only
        access data which is accessible to allUsers (in this case, the project and
        access level parameters are meaningless)
        - ``'browser'``, you get an access code with which you can
        authenticate via a specially provided URL
        - ``'cloud'``, we assume we are running within google compute
        or google container engine, and query the internal metadata directly for
        a token.
        - you may supply a token generated by the
        [gcloud](https://cloud.google.com/sdk/docs/)
        utility; this is either a python dictionary or the name of a file
        containing the JSON returned by logging in with the gcloud CLI tool.
        """
        return [
            click.Option(
                ["--service-account-key"],
                default=None,
                type=FileOrJson(allow_raw_str=True),
                help=help_string,
            ),
        ]
51
+
52
+
53
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the source command for GCS, wired to ``GcsCliConfig`` and marked as fsspec."""
    return BaseSrcCmd(cmd_name=CMD_NAME, cli_config=GcsCliConfig, is_fsspec=True)
60
+
61
+
62
def get_base_dest_cmd():
    """Build the destination command for GCS.

    The command is wired to ``GcsCliConfig`` and ``GcsWriteConfig`` and marked as fsspec.
    """
    # Imported at call time rather than at module import time.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=GcsCliConfig,
        write_config=GcsWriteConfig,
        is_fsspec=True,
    )
@@ -0,0 +1,74 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config
11
+
12
+ CMD_NAME = "s3"
13
+
14
+
15
@dataclass
class S3CliConfig(SimpleS3Config, CliConfig):
    """S3 connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the S3 command.

        Declares the ``--anonymous`` flag (off by default) and the string options
        ``--endpoint-url``, ``--key``, ``--secret`` and ``--token``, each defaulting to ``None``.
        """
        anonymous = click.Option(
            ["--anonymous"],
            is_flag=True,
            default=False,
            help="Connect to s3 without local AWS credentials.",
        )
        endpoint_url = click.Option(
            ["--endpoint-url"],
            type=str,
            default=None,
            help="Use this endpoint_url, if specified. Needed for "
            "connecting to non-AWS S3 buckets.",
        )
        access_key = click.Option(
            ["--key"],
            type=str,
            default=None,
            help="If not anonymous, use this access key ID, if specified. Takes precedence "
            "over `aws_access_key_id` in client_kwargs.",
        )
        secret_key = click.Option(
            ["--secret"],
            type=str,
            default=None,
            help="If not anonymous, use this secret access key, if specified.",
        )
        security_token = click.Option(
            ["--token"],
            type=str,
            default=None,
            help="If not anonymous, use this security token, if specified.",
        )
        return [anonymous, endpoint_url, access_key, secret_key, security_token]
54
+
55
+
56
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the source command for S3, wired to ``S3CliConfig`` and marked as fsspec.

    Returns:
        The configured ``BaseSrcCmd`` instance.
    """
    return BaseSrcCmd(cmd_name=CMD_NAME, cli_config=S3CliConfig, is_fsspec=True)
63
+
64
+
65
def get_base_dest_cmd():
    """Build the destination command for S3.

    The command is wired to ``S3CliConfig`` and ``S3WriteConfig`` and marked as fsspec.
    """
    # Imported at call time rather than at module import time.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=S3CliConfig,
        write_config=S3WriteConfig,
        is_fsspec=True,
    )
@@ -0,0 +1,58 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ )
10
+ from unstructured_ingest.connector.fsspec.sftp import SimpleSftpConfig
11
+
12
+ CMD_NAME = "sftp"
13
+
14
+
15
@dataclass
class SftpCliConfig(SimpleSftpConfig, CliConfig):
    """SFTP connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the SFTP command.

        ``--username`` and ``--password`` are required strings; ``--look-for-keys`` and
        ``--allow-agent`` are boolean flags that default to ``False``.
        """
        username = click.Option(
            ["--username"],
            required=True,
            type=str,
            help="Username for sftp connection",
        )
        password = click.Option(
            ["--password"],
            required=True,
            type=str,
            help="Password for sftp connection",
        )
        look_for_keys = click.Option(
            ["--look-for-keys"],
            required=False,
            default=False,
            is_flag=True,
            type=bool,
            help="Whether to search for private key files in ~/.ssh/",
        )
        allow_agent = click.Option(
            ["--allow-agent"],
            required=False,
            default=False,
            is_flag=True,
            type=bool,
            help="Whether to connect to the SSH agent.",
        )
        return [username, password, look_for_keys, allow_agent]
50
+
51
+
52
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the source command for SFTP, wired to ``SftpCliConfig`` and marked as fsspec."""
    return BaseSrcCmd(cmd_name=CMD_NAME, cli_config=SftpCliConfig, is_fsspec=True)
@@ -0,0 +1,54 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import CliConfig, DelimitedString
8
+ from unstructured_ingest.connector.github import SimpleGitHubConfig
9
+
10
+
11
@dataclass
class GithubCliConfig(SimpleGitHubConfig, CliConfig):
    """GitHub connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the GitHub command.

        ``--url`` is required; ``--git-access-token``, ``--git-branch`` and
        ``--git-file-glob`` default to ``None``. ``--git-file-glob`` is parsed with
        ``DelimitedString()``.
        """
        repo_url = click.Option(
            ["--url"],
            required=True,
            type=str,
            help="URL to GitHub repository, e.g. "
            '"https://github.com/Unstructured-IO/unstructured", or '
            'a repository owner/name pair, e.g. "Unstructured-IO/unstructured"',
        )
        access_token = click.Option(
            ["--git-access-token"],
            default=None,
            help="A GitHub or GitLab access token, "
            "see https://docs.github.com/en/authentication or "
            "https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens",
        )
        branch = click.Option(
            ["--git-branch"],
            default=None,
            type=str,
            help="The branch for which to fetch files from. If not given,"
            " the default repository branch is used.",
        )
        file_glob = click.Option(
            ["--git-file-glob"],
            default=None,
            type=DelimitedString(),
            help="A comma-separated list of file globs to limit which "
            "types of files are accepted, e.g. '*.html,*.txt'",
        )
        return [repo_url, access_token, branch, file_glob]
47
+
48
+
49
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``github`` source command, wired to ``GithubCliConfig``."""
    return BaseSrcCmd(cmd_name="github", cli_config=GithubCliConfig)
@@ -0,0 +1,54 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import CliConfig, DelimitedString
8
+ from unstructured_ingest.connector.gitlab import SimpleGitlabConfig
9
+
10
+
11
@dataclass
class GitlabCliConfig(SimpleGitlabConfig, CliConfig):
    """GitLab connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the GitLab command.

        ``--url`` is required; ``--git-access-token``, ``--git-branch`` and
        ``--git-file-glob`` default to ``None``. ``--git-file-glob`` is parsed with
        ``DelimitedString()``.
        """
        return [
            click.Option(
                ["--url"],
                required=True,
                type=str,
                # This is the GitLab command, so the help names GitLab and uses
                # gitlab.com examples rather than the GitHub wording.
                help="URL to GitLab repository, e.g. "
                '"https://gitlab.com/gitlab-org/gitlab", or '
                'a repository path, e.g. "gitlab-org/gitlab"',
            ),
            click.Option(
                ["--git-access-token"],
                default=None,
                help="A GitHub or GitLab access token, "
                "see https://docs.github.com/en/authentication or "
                "https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens",
            ),
            click.Option(
                ["--git-branch"],
                default=None,
                type=str,
                help="The branch for which to fetch files from. If not given,"
                " the default repository branch is used.",
            ),
            click.Option(
                ["--git-file-glob"],
                default=None,
                type=DelimitedString(),
                help="A comma-separated list of file globs to limit which types of "
                "files are accepted, e.g. '*.html,*.txt'",
            ),
        ]
47
+
48
+
49
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``gitlab`` source command, wired to ``GitlabCliConfig``."""
    return BaseSrcCmd(cmd_name="gitlab", cli_config=GitlabCliConfig)
@@ -0,0 +1,49 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ CliRecursiveConfig,
10
+ FileOrJson,
11
+ )
12
+ from unstructured_ingest.connector.google_drive import SimpleGoogleDriveConfig
13
+
14
+
15
@dataclass
class GoogleDriveCliConfig(SimpleGoogleDriveConfig, CliConfig):
    """Google Drive connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the Google Drive command.

        ``--drive-id`` and ``--service-account-key`` are required (the latter parsed with
        ``FileOrJson()``); ``--extension`` is a string that defaults to ``None``.
        """
        drive_id = click.Option(
            ["--drive-id"],
            required=True,
            type=str,
            help="Google Drive File or Folder ID.",
        )
        service_account_key = click.Option(
            ["--service-account-key"],
            required=True,
            type=FileOrJson(),
            help="Either the file path of the credentials file to use or a json string of "
            "those values to use for authentication",
        )
        extension = click.Option(
            ["--extension"],
            default=None,
            type=str,
            help="Filters the files to be processed based on extension e.g. .jpg, .docx, etc.",
        )
        return [drive_id, service_account_key, extension]
41
+
42
+
43
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``google-drive`` source command.

    The command uses ``GoogleDriveCliConfig`` and also registers the options declared by
    ``CliRecursiveConfig``.
    """
    extra_option_sources = [CliRecursiveConfig]
    return BaseSrcCmd(
        cmd_name="google-drive",
        cli_config=GoogleDriveCliConfig,
        additional_cli_options=extra_option_sources,
    )
@@ -0,0 +1,70 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import CliMixin, DelimitedString, Dict
8
+ from unstructured_ingest.connector.hubspot import HubSpotObjectTypes, SimpleHubSpotConfig
9
+
10
# The ``.value`` of every ``HubSpotObjectTypes`` member. The loop variable is spelled out so it
# does not shadow the module-level ``typing as t`` alias.
OBJECT_TYPES = {object_type.value for object_type in HubSpotObjectTypes}
11
+
12
+
13
def validate_custom_property(ctx, param, value) -> t.Optional[t.Dict[str, t.List[str]]]:
    """Validate the parsed ``--custom-properties`` value (used as a click option callback).

    Args:
        ctx: Callback context argument; not used here.
        param: Callback parameter argument; not used here.
        value: Parsed option value: a mapping from object type to a list of property
            names, or a falsy value (such as the ``None`` default) when nothing was supplied.

    Returns:
        ``value`` unchanged. Falsy values are passed straight through, which is why the
        return type is optional.

    Raises:
        ValueError: If a key is not in ``OBJECT_TYPES`` or a mapped value is not a list.
    """
    if not value:
        return value
    for object_type, properties in value.items():
        if object_type not in OBJECT_TYPES:
            raise ValueError(
                f"Invalid object type: {object_type}, must be one of {OBJECT_TYPES}"
            )
        if not isinstance(properties, list):
            raise ValueError(f"Invalid type: {type(properties)}, must be a Python list.")
    return value
22
+
23
+
24
@dataclass
class HubSpotCliConfig(SimpleHubSpotConfig, CliMixin):
    """HubSpot connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the HubSpot command.

        ``--api-token`` is required. ``--object-types`` (parsed with
        ``DelimitedString(choices=OBJECT_TYPES)``) and ``--custom-properties`` (parsed with
        ``Dict()`` and checked by ``validate_custom_property``) default to ``None``.
        """
        # Help texts use implicit string concatenation. A backslash-newline inside a
        # literal would carry the next source line's indentation into the help text.
        # OBJECT_TYPES is unordered, so it is sorted to keep the help output stable.
        object_type_names = ",".join(sorted(OBJECT_TYPES))
        return [
            click.Option(
                ["--api-token"],
                required=True,
                type=str,
                help="Access token to perform operations on Hubspot. "
                "Check "
                "https://developers.hubspot.com/docs/api/private-apps/ "
                "for more info",
            ),
            click.Option(
                ["--object-types"],
                default=None,
                required=False,
                type=DelimitedString(choices=OBJECT_TYPES),
                is_flag=False,
                help="Object to include in the process. "
                f"Must be a subset of {object_type_names}. "
                "If the argument is omitted all objects listed will be processed.",
            ),
            click.Option(
                ["--custom-properties"],
                default=None,
                required=False,
                type=Dict(),
                is_flag=False,
                callback=validate_custom_property,
                help="Custom property to process information from. "
                "It should be a json-like string in the form "
                "<object_type>:[<custom_property_id>, ..., <custom_property_id>] "
                "Must be internal name of the variable. If the property is missing, "
                "it will be omitted.",
            ),
        ]
63
+
64
+
65
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``hubspot`` source command, wired to ``HubSpotCliConfig``."""
    return BaseSrcCmd(cmd_name="hubspot", cli_config=HubSpotCliConfig)
@@ -0,0 +1,71 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import (
8
+ CliConfig,
9
+ DelimitedString,
10
+ )
11
+ from unstructured_ingest.connector.jira import SimpleJiraConfig
12
+
13
+
14
@dataclass
class JiraCliConfig(SimpleJiraConfig, CliConfig):
    """Jira connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the Jira command.

        ``--api-token``, ``--url`` and ``--user-email`` are required strings.
        ``--projects``, ``--boards`` and ``--issues`` default to ``None`` and are parsed
        with ``DelimitedString()``.
        """
        return [
            click.Option(
                ["--api-token"],
                required=True,
                type=str,
                # Implicit concatenation keeps source indentation out of the help text,
                # unlike a backslash-newline inside the literal.
                help="API Token to authenticate into Jira (into Atlassian). "
                "Check "
                "https://developer.atlassian.com/cloud/jira/platform/basic-auth-for-rest-apis/ "
                "for more info.",
            ),
            click.Option(
                ["--url"],
                required=True,
                type=str,
                help="URL to Atlassian (Jira) Cloud, e.g. "
                '"unstructured-jira-connector-test.atlassian.net"',
            ),
            click.Option(
                ["--user-email"],
                required=True,
                type=str,
                help="Email to authenticate into Atlassian (Jira) Cloud.",
            ),
            click.Option(
                ["--projects"],
                default=None,
                type=DelimitedString(),
                help="Comma-delimited Project ids or keys. Use Jira UI or the "
                "API to find or obtain keys. Alternatively, use API to obtain ids.",
            ),
            click.Option(
                ["--boards"],
                default=None,
                type=DelimitedString(),
                help="Comma-delimited Board ids. Check board URL, or use the "
                "API to find the board ids.",
            ),
            click.Option(
                ["--issues"],
                default=None,
                type=DelimitedString(),
                help="Comma-delimited Issue ids or keys. Use Jira UI or the API to "
                "find or obtain keys. Alternatively, use API to obtain ids.",
            ),
        ]
64
+
65
+
66
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the ``jira`` source command, wired to ``JiraCliConfig``."""
    return BaseSrcCmd(cmd_name="jira", cli_config=JiraCliConfig)
@@ -0,0 +1,102 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ import click
5
+
6
+ from unstructured_ingest.cli.base.src import BaseSrcCmd
7
+ from unstructured_ingest.cli.interfaces import CliConfig
8
+ from unstructured_ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig
9
+
10
+ CMD_NAME = "kafka"
11
+
12
+
13
@dataclass
class KafkaCliConfig(SimpleKafkaConfig, CliConfig):
    """Kafka connector config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for the Kafka command.

        ``--bootstrap-server``, ``--port`` and ``--topic`` are required strings.
        ``--kafka-api-key`` and ``--secret`` are optional strings.
        ``--num-messages-to-consume`` (int, default 1), ``--timeout`` (float, default 1.0)
        and ``--confluent`` (bool, default ``True``) are optional.
        """
        return [
            click.Option(
                ["--bootstrap-server"],
                required=True,
                type=str,
                help="Broker server hostname",
            ),
            click.Option(
                ["--port"],
                required=True,
                type=str,
                help="The bootstrap port",
            ),
            click.Option(
                ["--topic"],
                required=True,
                type=str,
                # Stray trailing apostrophe removed from the help text.
                help="The topic to write into.",
            ),
            click.Option(
                ["--kafka-api-key"],
                required=False,
                type=str,
                help="The API KEY",
            ),
            click.Option(
                ["--secret"],
                required=False,
                type=str,
                help="The secret",
            ),
            click.Option(
                ["--num-messages-to-consume"],
                required=False,
                type=int,
                default=1,
                help="The number of messages to consume before unblocking the consumer",
            ),
            click.Option(
                ["--timeout"],
                required=False,
                type=float,
                default=1.0,
                help="Maximum time to block waiting for message (seconds)",
            ),
            click.Option(
                ["--confluent"],
                required=False,
                type=bool,
                default=True,
                help="Whether this Kafka instance is from Confluent",
            ),
        ]
68
+
69
+
70
@dataclass
class KafkaCliWriteConfig(KafkaWriteConfig, CliConfig):
    """Kafka write config that also declares its command-line options."""

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options for Kafka writes: ``--batch-size`` (int, default 4)."""
        batch_size = click.Option(
            ["--batch-size"],
            default=4,
            type=int,
            help="Number of records per batch",
        )
        return [batch_size]
83
+
84
+
85
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the Kafka source command, wired to ``KafkaCliConfig``."""
    return BaseSrcCmd(cmd_name=CMD_NAME, cli_config=KafkaCliConfig)
91
+
92
+
93
def get_base_dest_cmd():
    """Build the Kafka destination command.

    The command uses ``KafkaCliConfig`` and ``KafkaWriteConfig`` and also registers the
    options declared by ``KafkaCliWriteConfig``.
    """
    # Imported at call time rather than at module import time.
    from unstructured_ingest.cli.base.dest import BaseDestCmd

    extra_option_sources = [KafkaCliWriteConfig]
    return BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=KafkaCliConfig,
        additional_cli_options=extra_option_sources,
        write_config=KafkaWriteConfig,
    )