unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from unstructured_ingest.v2.processes.connector_registry import (
4
+ add_destination_entry,
5
+ add_source_entry,
6
+ )
7
+
8
+ from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE
9
+ from .azure import azure_destination_entry, azure_source_entry
10
+ from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE
11
+ from .box import box_destination_entry, box_source_entry
12
+ from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE
13
+ from .dropbox import dropbox_destination_entry, dropbox_source_entry
14
+ from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE
15
+ from .gcs import gcs_destination_entry, gcs_source_entry
16
+ from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE
17
+ from .s3 import s3_destination_entry, s3_source_entry
18
+ from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE
19
+ from .sftp import sftp_destination_entry, sftp_source_entry
20
+
21
# Register every fsspec-backed connector as both a source and a destination.
# Order is kept: azure, box, dropbox, gcs, s3, sftp (source first, then destination).
for _connector_type, _source_entry, _destination_entry in (
    (AZURE_CONNECTOR_TYPE, azure_source_entry, azure_destination_entry),
    (BOX_CONNECTOR_TYPE, box_source_entry, box_destination_entry),
    (DROPBOX_CONNECTOR_TYPE, dropbox_source_entry, dropbox_destination_entry),
    (GCS_CONNECTOR_TYPE, gcs_source_entry, gcs_destination_entry),
    (S3_CONNECTOR_TYPE, s3_source_entry, s3_destination_entry),
    (SFTP_CONNECTOR_TYPE, sftp_source_entry, sftp_destination_entry),
):
    add_source_entry(source_type=_connector_type, entry=_source_entry)
    add_destination_entry(destination_type=_connector_type, entry=_destination_entry)

# Keep the loop variables out of the package namespace.
del _connector_type, _source_entry, _destination_entry
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
12
+ from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
13
+ from unstructured_ingest.v2.logger import logger
14
+ from unstructured_ingest.v2.processes.connector_registry import (
15
+ DestinationRegistryEntry,
16
+ SourceRegistryEntry,
17
+ )
18
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
19
+ FsspecAccessConfig,
20
+ FsspecConnectionConfig,
21
+ FsspecDownloader,
22
+ FsspecDownloaderConfig,
23
+ FsspecIndexer,
24
+ FsspecIndexerConfig,
25
+ FsspecUploader,
26
+ FsspecUploaderConfig,
27
+ )
28
+ from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
29
+
30
+ if TYPE_CHECKING:
31
+ from adlfs import AzureBlobFileSystem
32
+
33
+ CONNECTOR_TYPE = "azure"
34
+
35
+
36
def azure_json_serial(obj):
    """Serialize Azure-specific values, deferring everything else to ``json_serial``.

    ``ContentSettings`` instances are converted with ``dict(obj)`` and
    ``bytearray`` values with ``str(obj)``; any other object is handed to
    ``json_serial`` unchanged.
    """
    # Imported inside the function, so the azure SDK is only needed when this is called.
    from azure.storage.blob._models import ContentSettings

    if isinstance(obj, ContentSettings):
        serialized = dict(obj)
    elif isinstance(obj, bytearray):
        serialized = str(obj)
    else:
        serialized = json_serial(obj)
    return serialized
44
+
45
+
46
class AzureIndexerConfig(FsspecIndexerConfig):
    # Adds nothing to FsspecIndexerConfig; gives the azure connector its own config type.
    pass
48
+
49
+
50
class AzureAccessConfig(FsspecAccessConfig):
    # Credentials for the azure connector. Every field is optional on its own, but
    # model_post_init requires at least one of connection_string / account_name.
    account_name: Optional[str] = Field(
        default=None,
        description="The storage account name. This is used to authenticate "
        "requests signed with an account key and to construct "
        "the storage endpoint. It is required unless a connection "
        "string is given, or if a custom domain is used with "
        "anonymous authentication.",
    )
    account_key: Optional[str] = Field(
        default=None,
        description="The storage account key. This is used for shared key "
        "authentication. If any of account key, sas token or "
        "client_id are not specified, anonymous access will be used.",
    )
    connection_string: Optional[str] = Field(
        default=None,
        description="If specified, this will override all other parameters. See "
        "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ "  # noqa: E501
        "for the connection string format.",
    )
    sas_token: Optional[str] = Field(
        default=None,
        description="A shared access signature token to use to authenticate "
        "requests instead of the account key. If account key and "
        "sas token are both specified, account key will be used "
        "to sign. If any of account key, sas token or client_id "
        "are not specified, anonymous access will be used.",
    )

    def model_post_init(self, __context: Any) -> None:
        """Reject configs that set neither ``connection_string`` nor ``account_name``.

        Raises:
            ValueError: if both fields are ``None``.
        """
        has_connection_string = self.connection_string is not None
        has_account_name = self.account_name is not None
        if not (has_connection_string or has_account_name):
            raise ValueError("either connection_string or account_name must be set")
83
+
84
+
85
class AzureConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the azure connector; "az" is the only supported protocol.
    supported_protocols: list[str] = field(default_factory=lambda: ["az"], init=False)
    access_config: Secret[AzureAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Return the dumped access config with all falsy values left out."""
        dumped = self.access_config.get_secret_value().model_dump()
        # The truthiness test drops every falsy value (None, "", ...), not only None.
        return {key: value for key, value in dumped.items() if value}

    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as fs_client:
            yield fs_client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate an azure SDK exception into one of the ingest error types.

        Returns:
            ``UserAuthError`` for ``ClientAuthenticationError``, ``UserError`` for
            4xx responses, ``ProviderError`` for 5xx responses; otherwise the
            original exception, after logging it as unhandled.
        """
        from azure.core.exceptions import ClientAuthenticationError, HttpResponseError

        if isinstance(e, HttpResponseError):
            if isinstance(e, ClientAuthenticationError):
                return UserAuthError(e.reason)
            status_code = e.status_code
            message = e.reason
            if status_code is not None:
                if 400 <= status_code < 500:
                    return UserError(message)
                if status_code >= 500:
                    return ProviderError(message)

        # Reached for non-HTTP errors and for HTTP errors without a 4xx/5xx status.
        logger.error(f"unhandled exception from azure ({type(e)}): {e}", exc_info=True)
        return e
120
+
121
+
122
@dataclass
class AzureIndexer(FsspecIndexer):
    # Indexer for the azure connector.
    connection_config: AzureConnectionConfig
    index_config: AzureIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def sterilize_info(self, file_data: dict) -> dict:
        """Run ``sterilize_dict`` over ``file_data`` with the azure-aware serializer."""
        return sterilize_dict(data=file_data, default=azure_json_serial)

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-info dict.

        Args:
            file_data: file-info dict; ``name`` is required, while
                ``creation_time``, ``last_modified``, ``size`` and ``etag`` are
                read when available.

        Returns:
            A ``FileDataSourceMetadata`` whose dates are stringified POSIX
            timestamps, or ``None`` when the value is missing.
        """
        path = file_data["name"]

        # A key may be present with a None value, so test the value rather than
        # key membership before calling .timestamp().
        creation_time = file_data.get("creation_time")
        date_created = str(creation_time.timestamp()) if creation_time is not None else None
        last_modified = file_data.get("last_modified")
        date_modified = str(last_modified.timestamp()) if last_modified is not None else None

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=file_data.get("etag"),
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_data.get("size"),
        )
160
+
161
+
162
class AzureDownloaderConfig(FsspecDownloaderConfig):
    # Adds nothing to FsspecDownloaderConfig; exists as the azure-specific config type.
    pass


@dataclass
class AzureDownloader(FsspecDownloader):
    # Downloader for the azure connector; protocol defaults to "az".
    # NOTE(review): connection_config has no default yet follows a defaulted field;
    # presumably FsspecDownloader already declares these fields - confirm.
    protocol: str = "az"
    connection_config: AzureConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
172
+
173
+
174
class AzureUploaderConfig(FsspecUploaderConfig):
    # Adds nothing to FsspecUploaderConfig; exists as the azure-specific config type.
    pass


@dataclass
class AzureUploader(FsspecUploader):
    # Uploader for the azure connector.
    connector_type: str = CONNECTOR_TYPE
    connection_config: AzureConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: AzureUploaderConfig = field(default=None)
183
+
184
+
185
# Source-side registry entry: the classes used to index and download from azure.
azure_source_entry = SourceRegistryEntry(
    connection_config=AzureConnectionConfig,
    indexer_config=AzureIndexerConfig,
    indexer=AzureIndexer,
    downloader_config=AzureDownloaderConfig,
    downloader=AzureDownloader,
)

# Destination-side registry entry: the classes used to upload to azure.
azure_destination_entry = DestinationRegistryEntry(
    connection_config=AzureConnectionConfig,
    uploader_config=AzureUploaderConfig,
    uploader=AzureUploader,
)
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
7
+
8
+ from dateutil import parser
9
+ from pydantic import Field, Secret
10
+ from pydantic.functional_validators import BeforeValidator
11
+
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.errors import ProviderError, UserAuthError, UserError
14
+ from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
15
+ from unstructured_ingest.v2.logger import logger
16
+ from unstructured_ingest.v2.processes.connector_registry import (
17
+ DestinationRegistryEntry,
18
+ SourceRegistryEntry,
19
+ )
20
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
21
+ FsspecAccessConfig,
22
+ FsspecConnectionConfig,
23
+ FsspecDownloader,
24
+ FsspecDownloaderConfig,
25
+ FsspecIndexer,
26
+ FsspecIndexerConfig,
27
+ FsspecUploader,
28
+ FsspecUploaderConfig,
29
+ )
30
+ from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
31
+
32
+ if TYPE_CHECKING:
33
+ from boxfs import BoxFileSystem
34
+
35
+ CONNECTOR_TYPE = "box"
36
+
37
+
38
class BoxIndexerConfig(FsspecIndexerConfig):
    # Adds nothing to FsspecIndexerConfig; gives the box connector its own config type.
    pass
40
+
41
+
42
class BoxAccessConfig(FsspecAccessConfig):
    # conform_string_to_dict runs as a BeforeValidator on the incoming value;
    # presumably it parses a JSON string into the dict - verify against the helper.
    box_app_config: Annotated[dict, BeforeValidator(conform_string_to_dict)] = Field(
        description="Box app credentials as a JSON string."
    )
46
+
47
+
48
class BoxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the box connector; "box" is the only supported protocol.
    supported_protocols: list[str] = field(default_factory=lambda: ["box"], init=False)
    access_config: Secret[BoxAccessConfig]
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    def get_access_config(self) -> dict[str, Any]:
        """Authenticate with Box via JWT and return the client keyword arguments.

        The result holds the authenticated auth object under ``"oauth"`` plus
        every dumped access-config field except ``box_app_config``.
        """
        from boxsdk import JWTAuth

        secret = self.access_config.get_secret_value()

        # Build the JWTAuth object from the app settings and authenticate it.
        oauth = JWTAuth.from_settings_dictionary(secret.box_app_config)
        oauth.authenticate_instance()

        # if not oauth.access_token:
        #     raise SourceConnectionError("Authentication failed: No access token generated.")

        # The raw app settings are replaced by the authenticated oauth object.
        remaining = secret.model_dump()
        remaining.pop("box_app_config", None)
        return {"oauth": oauth, **remaining}

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Box SDK exception into one of the ingest error types.

        Returns:
            ``UserAuthError`` for ``BoxOAuthException``, ``UserError`` for 4xx
            statuses, ``ProviderError`` for 5xx statuses; otherwise the original
            exception, after logging it as unhandled.
        """
        from boxsdk.exception import BoxAPIException, BoxOAuthException

        if isinstance(e, BoxOAuthException):
            return UserAuthError(e.message)
        if isinstance(e, BoxAPIException):
            # Fall back to the exception itself when it carries no message.
            message = e.message or e
            status = e.status
            if status:
                if 400 <= status < 500:
                    return UserError(message)
                if status >= 500:
                    return ProviderError(message)

        # Reached for non-Box errors and for API errors without a 4xx/5xx status.
        logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True)
        return e

    @requires_dependencies(["boxfs"], extras="box")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as fs_client:
            yield fs_client
99
+
100
+
101
@dataclass
class BoxIndexer(FsspecIndexer):
    # Indexer for the box connector.
    connection_config: BoxConnectionConfig
    index_config: BoxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-info dict.

        ``modified_at`` / ``created_at`` are parsed with ``dateutil`` and stored
        as stringified POSIX timestamps; missing or empty values stay ``None``.
        The ``id`` value is used both as the version and as the ``file_id``.
        """
        path = file_data["name"]

        modified_raw = file_data.get("modified_at")
        date_modified = str(parser.parse(modified_raw).timestamp()) if modified_raw else None
        created_raw = file_data.get("created_at")
        date_created = str(parser.parse(created_raw).timestamp()) if created_raw else None

        file_id = file_data.get("id")
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=file_id,
            url=f"{self.index_config.protocol}://{path}",
            record_locator={
                "protocol": self.index_config.protocol,
                "remote_file_path": self.index_config.remote_url,
                "file_id": file_id,
            },
            filesize_bytes=file_data.get("size"),
        )
133
+
134
+
135
class BoxDownloaderConfig(FsspecDownloaderConfig):
    # Adds nothing to FsspecDownloaderConfig; exists as the box-specific config type.
    pass


@dataclass
class BoxDownloader(FsspecDownloader):
    # Downloader for the box connector; protocol defaults to "box".
    # NOTE(review): connection_config has no default yet follows a defaulted field;
    # presumably FsspecDownloader already declares these fields - confirm.
    protocol: str = "box"
    connection_config: BoxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)
145
+
146
+
147
class BoxUploaderConfig(FsspecUploaderConfig):
    # Adds nothing to FsspecUploaderConfig; exists as the box-specific config type.
    pass


@dataclass
class BoxUploader(FsspecUploader):
    # Uploader for the box connector.
    connector_type: str = CONNECTOR_TYPE
    connection_config: BoxConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: BoxUploaderConfig = field(default=None)
156
+
157
+
158
# Source-side registry entry: the classes used to index and download from box.
box_source_entry = SourceRegistryEntry(
    connection_config=BoxConnectionConfig,
    indexer_config=BoxIndexerConfig,
    indexer=BoxIndexer,
    downloader_config=BoxDownloaderConfig,
    downloader=BoxDownloader,
)

# Destination-side registry entry: the classes used to upload to box.
box_destination_entry = DestinationRegistryEntry(
    connection_config=BoxConnectionConfig,
    uploader_config=BoxUploaderConfig,
    uploader=BoxUploader,
)
@@ -0,0 +1,168 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass, field
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Generator, Optional
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.utils.dep_check import requires_dependencies
11
+ from unstructured_ingest.v2.errors import (
12
+ ProviderError,
13
+ UserAuthError,
14
+ UserError,
15
+ )
16
+ from unstructured_ingest.v2.errors import (
17
+ RateLimitError as CustomRateLimitError,
18
+ )
19
+ from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
20
+ from unstructured_ingest.v2.logger import logger
21
+ from unstructured_ingest.v2.processes.connector_registry import (
22
+ DestinationRegistryEntry,
23
+ SourceRegistryEntry,
24
+ )
25
+ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
26
+ FsspecAccessConfig,
27
+ FsspecConnectionConfig,
28
+ FsspecDownloader,
29
+ FsspecDownloaderConfig,
30
+ FsspecIndexer,
31
+ FsspecIndexerConfig,
32
+ FsspecUploader,
33
+ FsspecUploaderConfig,
34
+ )
35
+
36
+ if TYPE_CHECKING:
37
+ from dropboxdrivefs import DropboxDriveFileSystem
38
+
39
+ CONNECTOR_TYPE = "dropbox"
40
+
41
+
42
class DropboxIndexerConfig(FsspecIndexerConfig):
    def model_post_init(self, __context):
        """Prefix ``path_without_protocol`` with "/" when it does not start with one."""
        current_path = self.path_without_protocol
        if current_path.startswith("/"):
            return
        self.path_without_protocol = "/" + current_path
46
+
47
+
48
class DropboxAccessConfig(FsspecAccessConfig):
    # The only credential for this connector; defaults to None when not supplied.
    token: Optional[str] = Field(default=None, description="Dropbox access token.")
50
+
51
+
52
class DropboxConnectionConfig(FsspecConnectionConfig):
    # Connection settings for the dropbox connector; "dropbox" is the only supported protocol.
    supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"], init=False)
    access_config: Secret[DropboxAccessConfig] = Field(
        default=DropboxAccessConfig(), validate_default=True
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
    @contextmanager
    def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
        """Yield the client produced by the parent ``get_client`` for ``protocol``."""
        with super().get_client(protocol=protocol) as client:
            yield client

    def wrap_error(self, e: Exception) -> Exception:
        """Translate a Dropbox SDK exception into one of the ingest error types.

        The translated exception is always returned, never raised, so the
        caller decides how to raise it (same contract as the azure and box
        connectors).

        Returns:
            ``UserAuthError`` for ``AuthError``, the ingest ``RateLimitError``
            for Dropbox's ``RateLimitError``, ``UserError`` for other 4xx
            statuses, ``ProviderError`` for 5xx statuses; otherwise the original
            exception, after logging it as unhandled.
        """
        from dropbox.exceptions import AuthError, HttpError, RateLimitError

        if not isinstance(e, HttpError):
            logger.error(f"unhandled exception from dropbox ({type(e)}): {e}", exc_info=True)
            return e
        if isinstance(e, AuthError):
            # Return instead of raise, matching the declared return type.
            return UserAuthError(e.error)
        if isinstance(e, RateLimitError):
            return CustomRateLimitError(e.error)

        status_code = e.status_code
        # Both original branches ended up passing the body attribute, so read it once.
        body = getattr(e, "body", None)
        if 400 <= status_code < 500:
            return UserError(body)
        if status_code >= 500:
            return ProviderError(body)

        logger.error(f"unhandled exception from dropbox ({type(e)}): {e}", exc_info=True)
        return e
88
+
89
+
90
@dataclass
class DropboxIndexer(FsspecIndexer):
    # Indexer for the dropbox connector.
    connection_config: DropboxConnectionConfig
    index_config: DropboxIndexerConfig
    connector_type: str = CONNECTOR_TYPE

    def get_path(self, file_data: dict) -> str:
        """Return the ``name`` entry of ``file_data`` unchanged."""
        return file_data["name"]

    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        """Build source metadata from one file-info dict.

        When both ``server_modified`` and ``client_modified`` are present, the
        earlier one becomes ``date_created`` and the later one ``date_modified``.
        Equal timestamps now populate both dates; previously they left both as
        ``None``. If either value is missing, both dates stay ``None``.
        """
        # The URL is built as "<protocol>://<path>", so drop leading slashes here.
        path = file_data["name"].lstrip("/")

        date_created = None
        date_modified = None
        server_modified = file_data.get("server_modified")
        client_modified = file_data.get("client_modified")
        if server_modified and client_modified:
            date_created = str(min(server_modified, client_modified).timestamp())
            date_modified = str(max(server_modified, client_modified).timestamp())

        record_locator = {
            "protocol": self.index_config.protocol,
            "remote_file_path": self.index_config.remote_url,
            "file_id": file_data.get("id"),
        }
        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=file_data.get("content_hash"),
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
            filesize_bytes=file_data.get("size"),
        )
129
+
130
+
131
class DropboxDownloaderConfig(FsspecDownloaderConfig):
    # Adds nothing to FsspecDownloaderConfig; exists as the dropbox-specific config type.
    pass


@dataclass
class DropboxDownloader(FsspecDownloader):
    # Downloader for the dropbox connector; protocol defaults to "dropbox".
    # NOTE(review): connection_config has no default yet follows a defaulted field;
    # presumably FsspecDownloader already declares these fields - confirm.
    protocol: str = "dropbox"
    connection_config: DropboxConnectionConfig
    connector_type: str = CONNECTOR_TYPE
    download_config: Optional[DropboxDownloaderConfig] = field(
        default_factory=DropboxDownloaderConfig
    )
143
+
144
+
145
class DropboxUploaderConfig(FsspecUploaderConfig):
    # Adds nothing to FsspecUploaderConfig; exists as the dropbox-specific config type.
    pass


@dataclass
class DropboxUploader(FsspecUploader):
    # Uploader for the dropbox connector.
    connector_type: str = CONNECTOR_TYPE
    connection_config: DropboxConnectionConfig
    # Defaults to None even though the annotation is not Optional.
    upload_config: DropboxUploaderConfig = field(default=None)
154
+
155
+
156
# Source-side registry entry: the classes used to index and download from dropbox.
dropbox_source_entry = SourceRegistryEntry(
    connection_config=DropboxConnectionConfig,
    indexer_config=DropboxIndexerConfig,
    indexer=DropboxIndexer,
    downloader_config=DropboxDownloaderConfig,
    downloader=DropboxDownloader,
)

# Destination-side registry entry: the classes used to upload to dropbox.
dropbox_destination_entry = DestinationRegistryEntry(
    connection_config=DropboxConnectionConfig,
    uploader_config=DropboxUploaderConfig,
    uploader=DropboxUploader,
)