unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557):
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,85 @@
1
+ import logging
2
+ from dataclasses import dataclass, field
3
+ from typing import Any
4
+
5
+ import click
6
+ from pydantic import BaseModel
7
+
8
+ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
9
+ from unstructured_ingest.v2.cli.utils.click import Group, conform_click_options
10
+ from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model
11
+ from unstructured_ingest.v2.interfaces import ProcessorConfig
12
+ from unstructured_ingest.v2.logger import logger
13
+ from unstructured_ingest.v2.processes import (
14
+ ChunkerConfig,
15
+ EmbedderConfig,
16
+ FiltererConfig,
17
+ PartitionerConfig,
18
+ )
19
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
20
+
21
+
22
@dataclass
class SrcCmd(BaseCmd):
    """Builds the click command group for one registered source connector.

    CLI options are generated from the config models referenced by
    ``registry_entry`` (connection, indexer and downloader configs, whichever
    are set).
    """

    registry_entry: SourceRegistryEntry
    # Config model classes used by default for every source command.
    default_configs: list[BaseModel] = field(
        default_factory=lambda: [
            ProcessorConfig,
            PartitionerConfig,
            EmbedderConfig,
            FiltererConfig,
            ChunkerConfig,
        ]
    )

    def get_registry_options(self):
        """Return the consolidated click options derived from the registry entry."""
        candidate_models = (
            self.registry_entry.connection_config,
            self.registry_entry.indexer_config,
            self.registry_entry.downloader_config,
        )
        collected = []
        for model in candidate_models:
            # Skip configs the registry entry does not define.
            if not model:
                continue
            collected.extend(options_from_base_model(model=model))
        return self.consolidate_options(options=collected)

    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
        # Documentation is kept in comments here: get_cmd() hands this callable
        # to click.group(), and click uses a callback's docstring as help text.
        if ctx.invoked_subcommand:
            # Defer to the invoked subcommand.
            return

        conform_click_options(options)
        verbose = options.get("verbose", False)
        logger.setLevel(logging.DEBUG if verbose else logging.INFO)
        try:
            self.get_pipeline(src=self.cmd_name, source_options=options).run()
        except Exception as e:
            logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
            raise click.ClickException(str(e)) from e

    def get_cmd(self) -> click.Group:
        """Assemble the click group for this source without using click decorators.

        Raises:
            ValueError: If click does not produce a ``click.Group`` instance.
        """
        callback = click.pass_context(self.cmd)
        group = click.group(callback, cls=Group)
        if not isinstance(group, click.core.Group):
            raise ValueError(f"generated src command was not of expected type Group: {type(group)}")
        group.name = self.cli_cmd_name
        group.short_help = "v2"
        # Let the group callback run even when no subcommand is given.
        group.invoke_without_command = True
        self.add_options(group)

        # TODO remove after v1 no longer supported
        output_dir_option = click.Option(
            ["--output-dir"],
            required=False,
            type=str,
            help="Local path to write partitioned output to",
        )
        group.params.append(output_dir_option)
        return group
@@ -0,0 +1,24 @@
1
+ import click
2
+
3
+ from unstructured_ingest.v2.cli.cmds import dest, src
4
+
5
+
6
@click.group()
def ingest():
    # Root click group; the body is a no-op. Subcommands are attached to it
    # in get_cmd(). Documented with a comment because click would use a
    # docstring here as the group's help text.
    pass
9
+
10
+
11
def get_cmd() -> click.Command:
    """Build and return the top-level CLI command.

    Every destination subcommand is registered under every source subcommand,
    and each source subcommand is then registered under the root group.
    """
    root = ingest
    for source_group in src:
        # Nest all destinations beneath this source.
        for destination_cmd in dest:
            source_group.add_command(destination_cmd)
        root.add_command(source_group)
    return root
@@ -0,0 +1,14 @@
1
+ import click
2
+
3
+ from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
4
+ from unstructured_ingest.v2.processes.connector_registry import (
5
+ destination_registry,
6
+ source_registry,
7
+ )
8
+
9
# One command wrapper per entry in the source and destination registries.
src_cmds = [
    SrcCmd(cmd_name=name, registry_entry=entry) for name, entry in source_registry.items()
]
dest_cmds = [
    DestCmd(cmd_name=name, registry_entry=entry) for name, entry in destination_registry.items()
]

# The click objects built from those wrappers.
src: list[click.Group] = [wrapper.get_cmd() for wrapper in src_cmds]

dest: list[click.Command] = [wrapper.get_cmd() for wrapper in dest_cmds]
File without changes
@@ -0,0 +1,237 @@
1
+ import json
2
+ import os.path
3
+ from datetime import date, datetime
4
+ from gettext import gettext, ngettext
5
+ from gettext import gettext as _
6
+ from pathlib import Path
7
+ from typing import Any, Optional, Type, TypeVar, Union
8
+
9
+ import click
10
+ from pydantic import BaseModel, ConfigDict, Secret, TypeAdapter, ValidationError
11
+
12
+
13
def conform_click_options(options: dict):
    """Convert every tuple value in ``options`` to a list, in place.

    Click delivers ``multiple`` options as tuples; downstream code expects lists.
    """
    tuple_keys = [key for key, value in options.items() if isinstance(value, tuple)]
    for key in tuple_keys:
        options[key] = list(options[key])
18
+
19
+
20
class Dict(click.ParamType):
    """Click parameter type that yields a dict.

    Accepts an existing dict, a ``Path`` to a JSON file, or a JSON string.
    """

    name = "dict"

    def convert(
        self,
        value: Any,
        param: Optional[click.Parameter] = None,
        ctx: Optional[click.Context] = None,
    ) -> Any:
        """Return ``value`` as parsed JSON, failing the parameter on invalid JSON."""
        if isinstance(value, dict):
            return value
        try:
            if isinstance(value, Path) and value.is_file():
                with value.open() as json_file:
                    return json.load(json_file)
            if isinstance(value, str):
                return json.loads(value)
        except json.JSONDecodeError:
            message = gettext(
                "{value} is not a valid json value.",
            ).format(value=value)
            self.fail(message, param, ctx)
        # Any other kind of value is not converted.
        return None
45
+
46
+
47
class FileOrJson(click.ParamType):
    """Click parameter type accepting an existing file path or a JSON string.

    With ``allow_raw_str`` set, a string that is neither is returned unchanged.
    """

    name = "file-or-json"

    def __init__(self, allow_raw_str: bool = False):
        self.allow_raw_str = allow_raw_str

    def convert(
        self,
        value: Any,
        param: Optional[click.Parameter] = None,
        ctx: Optional[click.Context] = None,
    ) -> Any:
        """Return the resolved file path (as ``str``) or the parsed JSON value."""
        # check if valid file
        expanded = os.path.expanduser(value)
        absolute = os.path.abspath(expanded)
        if os.path.isfile(absolute):
            return str(Path(absolute).resolve())
        if isinstance(value, str):
            try:
                return json.loads(value)
            except json.JSONDecodeError:
                if self.allow_raw_str:
                    return value
        message = gettext(
            "{value} is neither a valid json string nor an existing filepath.",
        ).format(value=value)
        self.fail(message, param, ctx)
76
+
77
+
78
class DelimitedString(click.ParamType):
    """Click parameter type that splits a delimited string into a list of strings.

    When ``choices`` is non-empty, every item must be one of the choices.
    """

    name = "delimited-string"

    def __init__(self, delimiter: str = ",", choices: Optional[list[str]] = None):
        self.choices = choices if choices else []
        self.delimiter = delimiter

    def convert(
        self,
        value: Any,
        param: Optional[click.Parameter] = None,
        ctx: Optional[click.Context] = None,
    ) -> Any:
        """Return the stripped items of ``value``, validating them against ``choices``."""
        # In case a list is provided as the default, will not break
        if isinstance(value, list):
            items = [str(item).strip() for item in value]
        else:
            items = [piece.strip() for piece in value.split(self.delimiter)]

        if self.choices:
            rendered_choices = ", ".join(repr(choice) for choice in self.choices)
            for item in items:
                if item in self.choices:
                    continue
                message = ngettext(
                    "{value!r} is not {choice}.",
                    "{value!r} is not one of {choices}.",
                    len(self.choices),
                ).format(value=item, choice=rendered_choices, choices=rendered_choices)
                self.fail(message, param, ctx)
        return items
111
+
112
+
113
class PydanticDateTime(click.ParamType):
    """Click parameter type that parses datetimes with pydantic's string validation."""

    name = "datetime"

    def convert(
        self,
        value: Any,
        param: Optional[click.Parameter] = None,
        ctx: Optional[click.Context] = None,
    ) -> Any:
        """Return ``value`` parsed as a ``datetime``, failing the parameter if invalid."""
        adapter = TypeAdapter(datetime)
        try:
            parsed = adapter.validate_strings(value)
        except ValidationError:
            self.fail(f"{value} is not a valid datetime", param, ctx)
        else:
            return parsed
126
+
127
+
128
class PydanticDate(click.ParamType):
    """Click parameter type that parses dates with pydantic's string validation."""

    name = "date"

    def convert(
        self,
        value: Any,
        param: Optional[click.Parameter] = None,
        ctx: Optional[click.Context] = None,
    ) -> Any:
        """Return ``value`` parsed as a ``date``, failing the parameter if invalid."""
        adapter = TypeAdapter(date)
        try:
            parsed = adapter.validate_strings(value)
        except ValidationError:
            self.fail(f"{value} is not a valid date", param, ctx)
        else:
            return parsed
141
+
142
+
143
+ BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
144
+
145
+
146
def unwrap_optional(val: Any) -> tuple[Any, bool]:
    """Strip ``Optional[...]`` from a type annotation.

    Args:
        val: The annotation to inspect.

    Returns:
        ``(inner, True)`` when ``val`` is a two-member ``Union`` that includes
        ``None``, where ``inner`` is the non-``None`` member; otherwise
        ``(val, False)``.
    """
    none_type = type(None)
    if (
        hasattr(val, "__origin__")
        and hasattr(val, "__args__")
        and val.__origin__ is Union
        and len(val.__args__) == 2
        and none_type in val.__args__
    ):
        # Union members are types, so the "None" member is NoneType rather than
        # the None singleton. Filter on the type so that Union[None, X] also
        # resolves to X instead of NoneType.
        inner = next(arg for arg in val.__args__ if arg is not none_type)
        return inner, True
    return val, False
158
+
159
+
160
def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
    """Build a ``config`` instance from a flat mapping of option values.

    Keys of ``flat_data`` are matched against the model's field names (or
    aliases); entries whose value is ``None`` are dropped. When the model has an
    ``access_config`` field, its own fields are collected from ``flat_data`` in
    the same way and nested under ``"access_config"``.

    Raises:
        TypeError: If the ``access_config`` annotation is neither ``Secret[...]``
            nor a ``BaseModel`` subclass.
    """

    def _select(names: list) -> dict:
        # Keep only the known keys that carry a non-None value.
        return {key: val for key, val in flat_data.items() if key in names and val is not None}

    fields = config.model_fields
    # NOTE(review): this rebinds model_config on the class that was passed in.
    config.model_config = ConfigDict(extra="ignore")
    data = _select([info.alias or name for name, info in fields.items()])

    access_config = fields.get("access_config")
    if not access_config:
        return config.model_validate(obj=data)

    access_config_type, is_optional = unwrap_optional(access_config.annotation)
    # Check if raw type is wrapped by a secret
    wrapped_in_secret = (
        hasattr(access_config_type, "__origin__")
        and hasattr(access_config_type, "__args__")
        and access_config_type.__origin__ is Secret
    )
    if wrapped_in_secret:
        ac_fields = access_config_type.__args__[0].model_fields
    elif issubclass(access_config_type, BaseModel):
        ac_fields = access_config_type.model_fields
    else:
        raise TypeError(f"Unrecognized access_config type: {access_config_type}")

    access_config_data = _select([info.alias or name for name, info in ac_fields.items()])
    if is_optional and not access_config_data:
        # Nothing supplied for an optional access config: pass None explicitly.
        data["access_config"] = None
    else:
        data["access_config"] = access_config_data
    return config.model_validate(obj=data)
189
+
190
+
191
class Group(click.Group):
    """``click.Group`` variant with lenient ``--help`` parsing and a renamed help section."""

    def parse_args(self, ctx, args):
        """
        Lets a subcommand be called with --help even when the parent command is
        missing some of its required parameters.
        """
        try:
            return super().parse_args(ctx, args)
        except click.MissingParameter:
            help_requested = "--help" in args
            if not help_requested:
                raise
            # remove the required params so that help can display
            for param in self.params:
                param.required = False
            return super().parse_args(ctx, args)

    def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None:
        """
        Mirrors click.Group.format_commands() but titles the section
        'Destinations' instead of 'Commands'.
        """
        visible = []
        for name in self.list_commands(ctx):
            command = self.get_command(ctx, name)
            # Skip names that do not resolve to a command, and hidden commands.
            if command is None or command.hidden:
                continue
            visible.append((name, command))

        if not visible:
            return

        # allow for 3 times the default spacing
        longest_name = max(len(entry[0]) for entry in visible)
        width = formatter.width
        limit = (width - 6 - longest_name) if width else (-6 - longest_name)

        rows = [(name, command.get_short_help_str(limit)) for name, command in visible]
        with formatter.section(_("Destinations")):
            formatter.write_dl(rows)
@@ -0,0 +1,222 @@
1
+ import contextlib
2
+ import datetime
3
+ from collections import Counter
4
+ from enum import EnumMeta
5
+ from pathlib import Path
6
+ from typing import (
7
+ Annotated,
8
+ Any,
9
+ Callable,
10
+ Literal,
11
+ Optional,
12
+ Type,
13
+ TypedDict,
14
+ Union,
15
+ get_args,
16
+ get_origin,
17
+ )
18
+ from uuid import UUID
19
+
20
+ import click
21
+ from annotated_types import Ge, Gt, Le, Lt, SupportsGe, SupportsGt, SupportsLe, SupportsLt
22
+ from click import Option
23
+ from pydantic import BaseModel, Secret, SecretStr
24
+ from pydantic.fields import FieldInfo
25
+ from pydantic.types import _SecretBase
26
+ from pydantic_core import PydanticUndefined
27
+
28
+ from unstructured_ingest.v2.cli.utils.click import (
29
+ DelimitedString,
30
+ Dict,
31
+ PydanticDate,
32
+ PydanticDateTime,
33
+ )
34
+
35
+ NoneType = type(None)
36
+
37
+
38
class _RangeDict(TypedDict, total=False):
    """Represent arguments to `click.IntRange` or `click.FloatRange`."""

    # Bound values; every key is optional because total=False.
    max: Union[SupportsLt, SupportsLe]
    min: Union[SupportsGt, SupportsGe]
    # Set to True by get_range_from_metadata for Lt/Gt constraints and to
    # False for Le/Ge constraints.
    max_open: bool
    min_open: bool
45
+
46
+
47
def get_range_from_metadata(metadata: list[Any]) -> _RangeDict:
    """Collect range keyword arguments from ``Le``/``Lt``/``Ge``/``Gt`` constraints.

    Later constraints override earlier ones for the same bound. Metadata
    entries of any other kind are ignored.
    """
    # (constraint class, attribute holding the bound, target key, open bound?)
    bound_specs = (
        (Le, "le", "max", False),
        (Lt, "lt", "max", True),
        (Ge, "ge", "min", False),
        (Gt, "gt", "min", True),
    )
    range_args: _RangeDict = {}
    for constraint in metadata:
        for constraint_cls, attr, key, is_open in bound_specs:
            if isinstance(constraint, constraint_cls):
                range_args[key] = getattr(constraint, attr)
                range_args[f"{key}_open"] = is_open
    return range_args
63
+
64
+
65
def is_boolean_flag(field_info: FieldInfo) -> bool:
    """Return True when the field's unwrapped annotation is exactly ``bool``."""
    return get_raw_type(field_info.annotation) is bool
69
+
70
+
71
+ def get_raw_type(val: Any) -> Any:
72
+ field_args = get_args(val)
73
+ field_origin = get_origin(val)
74
+ if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
75
+ field_type = next(field_arg for field_arg in field_args if field_arg is not None)
76
+ return field_type
77
+ if field_origin is Secret and len(field_args) == 1:
78
+ field_type = next(field_arg for field_arg in field_args if field_arg is not None)
79
+ return field_type
80
+ if val is SecretStr:
81
+ return str
82
+ return val
83
+
84
+
85
def get_default_value_from_field(field: FieldInfo) -> Optional[Union[Any, Callable[[], Any]]]:
    """Return the field's explicit default, else its default factory, else ``None``."""
    explicit_default = field.default
    if explicit_default is not PydanticUndefined:
        return explicit_default
    # default_factory is None when the field declares no factory.
    return field.default_factory
91
+
92
+
93
def get_option_name(field_name: str, field_info: FieldInfo) -> str:
    """Derive the click option declaration for a field.

    The alias (if any) takes precedence over the field name. The name is
    lower-cased and underscores become dashes. Boolean fields get the paired
    ``--name/--no-name`` form.
    """
    base = (field_info.alias or field_name).removeprefix("--")
    base = base.lower().replace("_", "-")
    flag = f"--{base}"
    return f"{flag}/--no-{base}" if is_boolean_flag(field_info) else flag
101
+
102
+
103
def get_numerical_type(field: FieldInfo) -> click.ParamType:
    """Return the click type for a numeric field, honouring range constraints.

    Args:
        field: Field whose unwrapped annotation is numeric.

    Returns:
        ``click.IntRange``/``click.INT`` for integer fields and
        ``click.FloatRange``/``click.FLOAT`` for everything else. The range
        variant is used when the field metadata carries bound constraints.
    """
    range_args = get_range_from_metadata(field.metadata)
    # Inspect the unwrapped annotation: comparing the raw annotation to ``int``
    # misses Optional[int] / Secret[int], which would then be parsed as floats.
    is_integer = get_raw_type(field.annotation) is int
    if range_args:
        range_type = click.IntRange if is_integer else click.FloatRange
        return range_type(**range_args)  # type: ignore[arg-type]
    # Non-integer numerical types default to float
    return click.INT if is_integer else click.FLOAT
113
+
114
+
115
def get_type_from_annotation(field_type: Any) -> click.ParamType:
    """Map a type annotation to the click parameter type used to parse it.

    ``Optional[...]``, ``Secret[...]`` and ``Annotated[...]`` wrappers are
    unwrapped recursively. A ``click.ParamType`` instance found in
    ``Annotated`` metadata is returned as-is.

    Args:
        field_type: The annotation to translate.

    Returns:
        The matching ``click.ParamType``.

    Raises:
        TypeError: If no mapping exists for ``field_type``.
    """
    field_origin = get_origin(field_type)
    field_args = get_args(field_type)
    if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
        # Union members are types: filter on NoneType (not the None singleton)
        # so that Union[None, X] is unwrapped to X as well.
        inner_type = next(arg for arg in field_args if arg is not NoneType)
        return get_type_from_annotation(field_type=inner_type)
    if field_origin is Annotated:
        underlying_type, *metadata = field_args
        # An explicit click type in the metadata wins.
        for item in metadata:
            if isinstance(item, click.ParamType):
                return item
        # Otherwise resolve the annotated type itself rather than falling
        # through with the Annotated wrapper still in place.
        return get_type_from_annotation(field_type=underlying_type)
    if field_origin is Secret and len(field_args) == 1:
        return get_type_from_annotation(field_type=field_args[0])
    if field_origin is list and len(field_args) == 1 and field_args[0] is str:
        return DelimitedString()
    if field_type is SecretStr:
        return click.STRING
    if dict in [field_type, field_origin]:
        return Dict()
    if field_type is str:
        return click.STRING
    if field_type is bool:
        return click.BOOL
    if field_type is UUID:
        return click.UUID
    if field_type is Path:
        return click.Path(path_type=Path)
    if field_type is datetime.datetime:
        return PydanticDateTime()
    if field_type is datetime.date:
        return PydanticDate()
    if field_origin is Literal:
        return click.Choice(field_args)
    if isinstance(field_type, EnumMeta):
        values = [member.value for member in field_type]
        return click.Choice(values)
    raise TypeError(f"Unexpected field type: {field_type}")
153
+
154
+
155
def _get_type_from_field(field: FieldInfo) -> click.ParamType:
    """Pick the click type for a field, routing ints and floats to the numeric handler."""
    annotation = field.annotation
    if get_raw_type(annotation) in (int, float):
        return get_numerical_type(field)
    return get_type_from_annotation(field_type=annotation)
161
+
162
+
163
def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
    """Create the click option for a single model field.

    Args:
        option_name: Option declaration, e.g. ``--name`` or ``--flag/--no-flag``.
        field_info: The pydantic field the option is generated from.

    Returns:
        A ``click.Option`` whose type, default, required flag and help text are
        derived from ``field_info``.
    """
    help_text = field_info.description or ""
    if examples := field_info.examples:
        # Examples may be arbitrary values, not only strings; stringify each
        # one so that joining cannot raise TypeError.
        help_text += f" [Examples: {', '.join(str(example) for example in examples)}]"
    return click.Option(
        param_decls=[option_name],
        type=_get_type_from_field(field_info),
        default=get_default_value_from_field(field_info),
        required=field_info.is_required(),
        help=str(help_text),
        is_flag=is_boolean_flag(field_info),
        # Only advertise a default that was given as an explicit value.
        show_default=field_info.default is not PydanticUndefined,
    )
177
+
178
+
179
def is_subclass(x: Any, y: Any) -> bool:
    """``issubclass`` that returns False instead of raising ``TypeError``."""
    try:
        return issubclass(x, y)
    except TypeError:
        # Raised when the arguments are not classes.
        return False
184
+
185
+
186
def post_check(options: list[Option]):
    """Ensure every option name is unique.

    Raises:
        ValueError: If one or more option names occur more than once.
    """
    name_counts = Counter(option.name for option in options)
    duplicate_names = [name for name, count in name_counts.items() if count > 1]
    if not duplicate_names:
        return
    reused = ", ".join(duplicate_names)
    raise ValueError(
        "the following field name were reused, all must be unique: {}".format(reused)
    )
195
+
196
+
197
def is_secret(value: Any) -> bool:
    """Return True when ``value`` is a pydantic secret type.

    Parametrised forms such as ``Secret[int]`` are checked through their
    origin. Plain classes such as ``SecretStr`` are checked directly.
    """
    is_parametrised = hasattr(value, "__origin__") and hasattr(value, "__args__")
    candidate = value.__origin__ if is_parametrised else value
    return is_subclass(candidate, _SecretBase)
204
+
205
+
206
def options_from_base_model(model: Union[BaseModel, Type[BaseModel]]) -> list[Option]:
    """Generate click options for every field of a pydantic model.

    Fields declared with ``init=False`` are skipped. Fields whose type is
    itself a ``BaseModel`` are expanded recursively into their own options.
    Secret fields have their help text prefixed with ``[sensitive]``.

    Args:
        model: The model (class or instance) to generate options for.

    Returns:
        The generated options.

    Raises:
        ValueError: If two generated options share the same name.
    """
    import copy

    options = []
    for field_name, field_info in model.model_fields.items():
        if field_info.init is False:
            continue
        option_name = get_option_name(field_name=field_name, field_info=field_info)
        raw_annotation = get_raw_type(field_info.annotation)
        if is_subclass(raw_annotation, BaseModel):
            options.extend(options_from_base_model(model=raw_annotation))
            continue
        if is_secret(field_info.annotation):
            # model_fields is state shared by the model class. Editing the
            # FieldInfo in place would stack another "[sensitive]" prefix on
            # every call, so annotate a copy instead.
            field_info = copy.copy(field_info)
            description = field_info.description
            # Avoid rendering the literal text "None" for undocumented fields.
            field_info.description = (
                f"[sensitive] {description}" if description else "[sensitive]"
            )
        options.append(get_option_from_field(option_name=option_name, field_info=field_info))

    post_check(options=options)
    return options
@@ -0,0 +1,2 @@
1
# Metadata key appended by uploaders that store element-level data.
RECORD_ID_LABEL = "record_id"
@@ -0,0 +1,18 @@
1
class UserError(Exception):
    """Base class for UserAuthError, RateLimitError and QuotaError."""


class UserAuthError(UserError):
    """UserError subclass for authentication failures (inferred from the name)."""


class RateLimitError(UserError):
    """UserError subclass for rate limiting (inferred from the name)."""


class QuotaError(UserError):
    """UserError subclass for quota problems (inferred from the name)."""


class ProviderError(Exception):
    """Derives directly from Exception, outside the UserError hierarchy."""
@@ -0,0 +1,32 @@
1
+ from .connector import AccessConfig, BaseConnector, ConnectionConfig
2
+ from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
3
+ from .file_data import BatchFileData, BatchItem, FileData, FileDataSourceMetadata, SourceIdentifiers
4
+ from .indexer import Indexer, IndexerConfig
5
+ from .process import BaseProcess
6
+ from .processor import ProcessorConfig
7
+ from .upload_stager import UploadStager, UploadStagerConfig
8
+ from .uploader import UploadContent, Uploader, UploaderConfig
9
+
10
+ __all__ = [
11
+ "DownloadResponse",
12
+ "download_responses",
13
+ "Downloader",
14
+ "DownloaderConfig",
15
+ "FileData",
16
+ "Indexer",
17
+ "IndexerConfig",
18
+ "BaseProcess",
19
+ "ProcessorConfig",
20
+ "UploadStager",
21
+ "UploadStagerConfig",
22
+ "Uploader",
23
+ "UploaderConfig",
24
+ "SourceIdentifiers",
25
+ "UploadContent",
26
+ "AccessConfig",
27
+ "ConnectionConfig",
28
+ "BaseConnector",
29
+ "FileDataSourceMetadata",
30
+ "BatchFileData",
31
+ "BatchItem",
32
+ ]
@@ -0,0 +1,50 @@
1
+ from abc import ABC
2
+ from dataclasses import dataclass
3
+ from typing import Any, TypeVar, Union
4
+
5
+ from pydantic import BaseModel, Secret, model_validator
6
+ from pydantic.types import _SecretBase
7
+
8
+
9
# Base model with no fields of its own. ConnectionConfig holds one wrapped in
# Secret (see the access_config field there).
class AccessConfig(BaseModel):
    """Meant to designate holding any sensitive information associated with other configs
    and also for access specific configs."""
12
+
13
+
14
+ AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
15
+
16
+
17
class ConnectionConfig(BaseModel):
    # Base model for connection settings. The access config is held inside a
    # pydantic Secret wrapper.
    access_config: Secret[AccessConfigT]

    def get_access_config(self) -> dict[str, Any]:
        """Return the unwrapped access config as a dict, or ``{}`` when it is falsy."""
        secret = self.access_config
        return secret.get_secret_value().model_dump() if secret else {}

    @model_validator(mode="after")
    def check_access_config(self):
        """Require ``access_config`` to be a secret wrapper unless it is optional and None."""
        access_config = self.access_config
        if self._is_access_config_optional() and access_config is None:
            return self
        if isinstance(access_config, _SecretBase):
            return self
        raise ValueError("access_config must be an instance of SecretBase")

    def _is_access_config_optional(self) -> bool:
        """Report whether the ``access_config`` annotation is a two-member Union with None."""
        annotation = self.model_fields["access_config"].annotation
        if not (hasattr(annotation, "__origin__") and hasattr(annotation, "__args__")):
            return False
        if annotation.__origin__ is not Union:
            return False
        members = annotation.__args__
        return len(members) == 2 and type(None) in members
43
+
44
+
45
+ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
46
+
47
+
48
# Abstract dataclass base whose only field is the connection config.
@dataclass
class BaseConnector(ABC):
    # Typed with the TypeVar bound to ConnectionConfig.
    connection_config: ConnectionConfigT