unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,447 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from time import time
8
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Generator, Iterator, Optional, TypeVar
9
+
10
+ from dateutil import parser
11
+ from pydantic import Field, Secret
12
+
13
+ from unstructured_ingest.error import (
14
+ DestinationConnectionError,
15
+ SourceConnectionError,
16
+ SourceConnectionNetworkError,
17
+ )
18
+ from unstructured_ingest.utils.dep_check import requires_dependencies
19
+ from unstructured_ingest.v2.interfaces import (
20
+ AccessConfig,
21
+ ConnectionConfig,
22
+ Downloader,
23
+ DownloaderConfig,
24
+ DownloadResponse,
25
+ FileData,
26
+ FileDataSourceMetadata,
27
+ Indexer,
28
+ IndexerConfig,
29
+ SourceIdentifiers,
30
+ Uploader,
31
+ UploaderConfig,
32
+ )
33
+ from unstructured_ingest.v2.logger import logger
34
+ from unstructured_ingest.v2.processes.connector_registry import (
35
+ DestinationRegistryEntry,
36
+ SourceRegistryEntry,
37
+ )
38
+
39
+ if TYPE_CHECKING:
40
+ from office365.graph_client import GraphClient
41
+ from office365.onedrive.driveitems.driveItem import DriveItem
42
+ from office365.onedrive.drives.drive import Drive
43
+
44
# Connector identifier; used as `connector_type` on the FileData and uploader defined below.
CONNECTOR_TYPE = "onedrive"
# NOTE: despite the "MB" in the name, this value is compared directly against byte counts
# (the remote item's "size" property and `Path.stat().st_size`) to choose between a
# single-request transfer and a chunked/resumable one.
MAX_MB_SIZE = 512_000_000
46
+
47
+
48
class OnedriveAccessConfig(AccessConfig):
    """Sensitive part of the OneDrive settings; wrapped in ``Secret`` by the connection config."""

    client_cred: str = Field(description="Microsoft App client secret")
50
+
51
+
52
class OnedriveConnectionConfig(ConnectionConfig):
    """Connection settings for reaching a user's OneDrive through Microsoft Graph.

    Tokens are obtained with the MSAL client-credentials flow and the token
    callable is handed to the office365 ``GraphClient``.
    """

    client_id: str = Field(description="Microsoft app client ID")
    user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
    tenant: str = Field(
        repr=False, description="ID or domain name associated with your Azure AD instance"
    )
    authority_url: Optional[str] = Field(
        repr=False,
        default="https://login.microsoftonline.com",
        examples=["https://login.microsoftonline.com"],
        description="Authentication token provider for Microsoft apps",
    )
    access_config: Secret[OnedriveAccessConfig]

    def get_drive(self) -> "Drive":
        """Return the drive handle of ``user_pname`` (no query is executed here)."""
        return self.get_client().users[self.user_pname].drive

    @requires_dependencies(["msal"], extras="onedrive")
    def get_token(self):
        """Acquire a Microsoft Graph token via the client-credentials flow.

        Returns:
            The token response returned by MSAL.

        Raises:
            ValueError: re-raised (after logging) when MSAL rejects the configuration.
            SourceConnectionNetworkError: when the token response carries an ``error`` key.
        """
        from msal import ConfidentialClientApplication

        # Strip a trailing slash so "https://host/" does not become "https://host//tenant".
        # str() mirrors the f-string formatting used before, so nothing else changes.
        authority_base = str(self.authority_url).rstrip("/")
        try:
            app = ConfidentialClientApplication(
                authority=f"{authority_base}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.access_config.get_secret_value().client_cred,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for OneDrive")
            raise
        if "error" in token:
            # .get() keeps a response without a description from raising KeyError
            # and hiding the actual authentication failure.
            raise SourceConnectionNetworkError(
                "failed to fetch token, {}: {}".format(
                    token["error"], token.get("error_description")
                )
            )
        return token

    @requires_dependencies(["office365"], extras="onedrive")
    def get_client(self) -> "GraphClient":
        """Build a ``GraphClient`` that calls :meth:`get_token` when it needs a token."""
        from office365.graph_client import GraphClient

        # The bound method itself is passed (not its result) so the client can invoke it.
        return GraphClient(self.get_token)
97
+
98
+
99
class OnedriveIndexerConfig(IndexerConfig):
    """Options selecting which part of the drive the indexer lists."""

    # Folder path resolved against the drive root; a falsy value means "start at the root".
    path: Optional[str] = Field(default="")
    # When True the indexer also descends into sub-folders.
    recursive: bool = False
102
+
103
+
104
T = TypeVar("T")


def async_iterable_to_sync_iterable(iterator: AsyncIterator[T]) -> Iterator[T]:
    """Drive an async iterator from synchronous code, yielding its items lazily.

    A private event loop is created, installed as the current loop, and used to
    run each ``__anext__()`` call to completion. When iteration ends, whether by
    exhaustion, by an error, or because the consumer stopped early, the async
    iterator is closed (if it exposes ``aclose``) *before* the loop is closed, so
    that cleanup code inside an async generator still has a live loop to run on.

    Args:
        iterator: The async iterator (typically an async generator) to consume.

    Yields:
        Each item produced by ``iterator``, in order.
    """
    # This version works on Python 3.9 by manually handling the async iteration.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while True:
            try:
                # Instead of anext(iterator), we directly call __anext__().
                # __anext__ returns an awaitable that we must run until complete.
                result = loop.run_until_complete(iterator.__anext__())
            except StopAsyncIteration:
                break
            yield result
    finally:
        try:
            # Plain async iterators need not define aclose(); async generators do.
            # On an already-finished generator aclose() is a no-op.
            aclose = getattr(iterator, "aclose", None)
            if aclose is not None:
                loop.run_until_complete(aclose())
        finally:
            loop.close()
123
+
124
+
125
@dataclass
class OnedriveIndexer(Indexer):
    """Lists the files of a OneDrive folder and converts them to ``FileData`` records.

    The blocking office365 calls live in the ``*_sync`` methods; their async
    counterparts simply run them in a worker thread via ``asyncio.to_thread``.
    """

    connection_config: OnedriveConnectionConfig
    index_config: OnedriveIndexerConfig

    def precheck(self) -> None:
        """Validate the credentials by requesting a token.

        Raises:
            SourceConnectionError: if the token cannot be obtained for any reason.
        """
        try:
            token_resp: dict = self.connection_config.get_token()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the cause so the original traceback is not lost.
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def list_objects_sync(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Return the files directly under ``folder`` and, if ``recursive``, below it."""
        drive_items = folder.children.get().execute_query()
        files = [d for d in drive_items if d.is_file]
        if not recursive:
            return files

        folders = [d for d in drive_items if d.is_folder]
        for f in folders:
            files.extend(self.list_objects_sync(f, recursive))
        return files

    async def list_objects(self, folder: "DriveItem", recursive: bool) -> list["DriveItem"]:
        """Thread-offloaded variant of :meth:`list_objects_sync`."""
        return await asyncio.to_thread(self.list_objects_sync, folder, recursive)

    def get_root_sync(self, client: "GraphClient") -> "DriveItem":
        """Resolve the folder indexing starts from.

        Raises:
            ValueError: if ``index_config.path`` is set but is not an existing folder.
        """
        root = client.users[self.connection_config.user_pname].drive.get().execute_query().root
        if fpath := self.index_config.path:
            root = root.get_by_path(fpath).get().execute_query()
            if root is None or not root.is_folder:
                raise ValueError(f"Unable to find directory, given: {fpath}")
        return root

    async def get_root(self, client: "GraphClient") -> "DriveItem":
        """Thread-offloaded variant of :meth:`get_root_sync`."""
        return await asyncio.to_thread(self.get_root_sync, client)

    def get_properties_sync(self, drive_item: "DriveItem") -> dict:
        """Return the subset of ``drive_item.properties`` that is JSON-serializable."""
        properties = drive_item.properties
        filtered_properties = {}
        for k, v in properties.items():
            try:
                # Serialization is only attempted to probe the value; the output is discarded.
                json.dumps(v)
                filtered_properties[k] = v
            except TypeError:
                # Deliberately drop values json cannot encode.
                pass
        return filtered_properties

    async def get_properties(self, drive_item: "DriveItem") -> dict:
        """Thread-offloaded variant of :meth:`get_properties_sync`."""
        return await asyncio.to_thread(self.get_properties_sync, drive_item)

    def _relative_to_index_root(self, server_path: str) -> str:
        """Strip the configured index path from the *start* of ``server_path``.

        Only a leading, whole-segment match is removed. The previous
        ``str.replace`` removed every occurrence anywhere in the path and raised
        ``TypeError`` when the optional index path was ``None``.
        """
        relative = server_path.lstrip("/")
        index_root = (self.index_config.path or "").strip("/")
        if index_root and (relative == index_root or relative.startswith(index_root + "/")):
            relative = relative[len(index_root):].lstrip("/")
        return relative

    def drive_item_to_file_data_sync(self, drive_item: "DriveItem") -> FileData:
        """Build the ``FileData`` record describing ``drive_item``."""
        # parent_reference.path is split on ":"; the last piece is the folder path.
        file_path = drive_item.parent_reference.path.split(":")[-1]
        file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path
        filename = drive_item.name
        server_path = file_path + "/" + filename
        rel_path = self._relative_to_index_root(server_path)
        date_modified_dt = (
            parser.parse(str(drive_item.last_modified_datetime))
            if drive_item.last_modified_datetime
            else None
        )
        date_created_at = (
            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
        )
        return FileData(
            identifier=drive_item.id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
            metadata=FileDataSourceMetadata(
                url=drive_item.parent_reference.path + "/" + drive_item.name,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={
                    "user_pname": self.connection_config.user_pname,
                    "server_relative_path": server_path,
                },
            ),
            additional_metadata=self.get_properties_sync(drive_item=drive_item),
        )

    async def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData:
        """Thread-offloaded variant of :meth:`drive_item_to_file_data_sync`."""
        # Offload the file data creation if it's not guaranteed async
        return await asyncio.to_thread(self.drive_item_to_file_data_sync, drive_item)

    async def _run_async(self, **kwargs: Any) -> AsyncIterator[FileData]:
        """Authenticate, list the configured folder and yield one ``FileData`` per file.

        Raises:
            SourceConnectionError: if the token response reports an error.
        """
        token_resp = await asyncio.to_thread(self.connection_config.get_token)
        if "error" in token_resp:
            raise SourceConnectionError(
                f"[{CONNECTOR_TYPE}]: {token_resp['error']} ({token_resp.get('error_description')})"
            )

        client = await asyncio.to_thread(self.connection_config.get_client)
        root = await self.get_root(client=client)
        drive_items = await self.list_objects(folder=root, recursive=self.index_config.recursive)

        for drive_item in drive_items:
            file_data = await self.drive_item_to_file_data(drive_item=drive_item)
            yield file_data

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Synchronous entry point; lazily yields the items of :meth:`_run_async`."""
        # Convert the async generator to a sync generator without loading all data into memory
        async_gen = self._run_async(**kwargs)
        yield from async_iterable_to_sync_iterable(async_gen)
238
+
239
+
240
class OnedriveDownloaderConfig(DownloaderConfig):
    """Downloader settings for OneDrive; declares no fields beyond ``DownloaderConfig``."""

    pass
242
+
243
+
244
@dataclass
class OnedriveDownloader(Downloader):
    """Fetches a single OneDrive file described by a ``FileData`` record to local disk."""

    connection_config: OnedriveConnectionConfig
    download_config: OnedriveDownloaderConfig

    @SourceConnectionNetworkError.wrap
    def _fetch_file(self, file_data: FileData):
        """Look up the remote drive item addressed by ``source_identifiers.fullpath``.

        Raises:
            ValueError: if the record has no source identifiers or no full path.
            FileNotFoundError: if the lookup yields a falsy item.
        """
        identifiers = file_data.source_identifiers
        if identifiers is None or not identifiers.fullpath:
            raise ValueError(
                f"file data doesn't have enough information to get "
                f"file content: {file_data.model_dump()}"
            )

        server_relative_path = identifiers.fullpath
        graph_client = self.connection_config.get_client()
        user_drive = graph_client.users[self.connection_config.user_pname].drive
        drive_root = user_drive.get().execute_query().root
        remote_item = drive_root.get_by_path(server_relative_path).get().execute_query()
        if not remote_item:
            raise FileNotFoundError(f"file not found: {server_relative_path}")
        return remote_item

    def get_download_path(self, file_data: FileData) -> Optional[Path]:
        """Map the record's relative path to a location under ``download_dir``."""
        relative = file_data.source_identifiers.relative_path
        if relative.startswith("/"):
            # Drop a single leading slash so the join below stays inside download_dir.
            relative = relative[1:]
        return self.download_dir / Path(relative)

    @SourceConnectionError.wrap
    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the file and return the corresponding download response.

        Items larger than ``MAX_MB_SIZE`` are pulled with a chunked download
        session; everything else uses a single download call. Any failure is
        logged and re-raised unchanged.
        """
        try:
            remote_item = self._fetch_file(file_data=file_data)
            size_in_bytes = remote_item.get_property("size", 0)
            target = self.get_download_path(file_data=file_data)
            target.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"downloading {file_data.source_identifiers.fullpath} to {target}")
            use_chunks = size_in_bytes > MAX_MB_SIZE
            if use_chunks:
                logger.info(f"downloading file with size: {size_in_bytes} bytes in chunks")
            with target.open(mode="wb") as sink:
                if use_chunks:
                    remote_item.download_session(
                        sink, chunk_size=1024 * 1024 * 100
                    ).execute_query()
                else:
                    remote_item.download(sink).execute_query()
            return self.generate_download_response(file_data=file_data, download_path=target)
        except Exception as e:
            logger.error(f"[{CONNECTOR_TYPE}] Exception during downloading: {e}", exc_info=True)
            # Re-raise to see full stack trace locally
            raise
290
+
291
+
292
class OnedriveUploaderConfig(UploaderConfig):
    """Destination settings for uploads, expressed as an ``onedrive://`` style URL."""

    remote_url: str = Field(
        description="URL of the destination in OneDrive, e.g., 'onedrive://Documents/Folder'"
    )
    prefix: str = "onedrive://"

    @property
    def root_folder(self) -> str:
        """First path segment of :attr:`url` (everything before the first ``/``)."""
        return self.url.split("/")[0]

    @property
    def url(self) -> str:
        """``remote_url`` with one leading ``prefix`` removed, if it starts with it."""
        if self.remote_url.startswith(self.prefix):
            return self.remote_url.replace(self.prefix, "", 1)
        return self.remote_url
315
+
316
+
317
@dataclass
class OnedriveUploader(Uploader):
    """Uploads local files into a OneDrive folder, creating folders when they are missing."""

    connection_config: OnedriveConnectionConfig
    upload_config: OnedriveUploaderConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["office365"], extras="onedrive")
    def precheck(self) -> None:
        """Validate credentials and make sure the configured root folder exists.

        The root folder is created when the lookup fails with the
        "resource could not be found" message.

        Raises:
            SourceConnectionError: if any step of the validation fails.
        """
        from office365.runtime.client_request_exception import ClientRequestException

        try:
            token_resp: dict = self.connection_config.get_token()
            if error := token_resp.get("error"):
                raise SourceConnectionError(
                    "{} ({})".format(error, token_resp.get("error_description"))
                )
            drive = self.connection_config.get_drive()
            root = drive.root
            root_folder = self.upload_config.root_folder
            folder = root.get_by_path(root_folder)
            try:
                folder.get().execute_query()
            except ClientRequestException as e:
                # Only a missing folder is recoverable; anything else is a real failure.
                if e.message != "The resource could not be found.":
                    raise e
                folder = root.create_folder(root_folder).execute_query()
                logger.info(f"successfully created folder: {folder.name}")
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def _get_or_create_folder(
        self, drive: "Drive", destination_folder: Path, destination_folder_str: str
    ) -> "DriveItem":
        """Return the destination folder item, creating missing path segments."""
        try:
            # Attempt to get the folder
            folder = drive.root.get_by_path(destination_folder_str)
            folder.get().execute_query()
            return folder
        except Exception:
            # Best effort: any lookup failure is treated as "folder doesn't exist"
            # and the path is walked/created segment by segment.
            current_folder = drive.root
            for part in destination_folder.parts:
                # OData string literals escape a single quote by doubling it; without
                # this a folder name containing "'" produces an invalid filter.
                escaped_part = part.replace("'", "''")
                folders = (
                    current_folder.children.filter(f"name eq '{escaped_part}' and folder ne null")
                    .get()
                    .execute_query()
                )
                if folders:
                    current_folder = folders[0]
                else:
                    # Folder doesn't exist, create it
                    current_folder = current_folder.create_folder(part).execute_query()
            return current_folder

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload ``path`` below the configured destination URL.

        The record's ``rel_path`` (when present) is appended to the destination
        to preserve directory structure; otherwise the local file name is used.
        Files smaller than ``MAX_MB_SIZE`` bytes are sent with a simple upload,
        larger ones with a resumable upload.

        Raises:
            DestinationConnectionError: if the upload fails or the uploaded item's
                name does not match the expected file name.
        """
        drive = self.connection_config.get_drive()

        # Use the remote_url from upload_config as the base destination folder
        base_destination_folder = self.upload_config.url

        # Use the file's relative path to maintain directory structure, if needed
        if file_data.source_identifiers and file_data.source_identifiers.rel_path:
            # Combine the base destination folder with the file's relative path
            destination_path = Path(base_destination_folder) / Path(
                file_data.source_identifiers.rel_path
            )
        else:
            # If no relative path is provided, upload directly to the base destination folder
            destination_path = Path(base_destination_folder) / path.name

        destination_folder = destination_path.parent
        file_name = destination_path.name

        # Convert destination folder to a string suitable for OneDrive API
        destination_folder_str = str(destination_folder).replace("\\", "/")

        # Resolve the destination folder in OneDrive, creating it if necessary
        folder = self._get_or_create_folder(drive, destination_folder, destination_folder_str)

        # Check the size of the file
        file_size = path.stat().st_size

        if file_size < MAX_MB_SIZE:
            # Use simple upload for small files
            with path.open("rb") as local_file:
                content = local_file.read()
                logger.info(f"Uploading {path} to {destination_path} using simple upload")
                try:
                    uploaded_file = folder.upload(file_name, content).execute_query()
                    if not uploaded_file or uploaded_file.name != file_name:
                        raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
                    # Log details about the uploaded file
                    logger.info(
                        f"Uploaded file '{uploaded_file.name}' with ID '{uploaded_file.id}'"
                    )
                except Exception as e:
                    logger.error(f"Failed to upload file '{file_name}': {e}", exc_info=True)
                    raise DestinationConnectionError(
                        f"Failed to upload file '{file_name}': {e}"
                    ) from e
        else:
            # Use resumable upload for large files
            destination_fullpath = f"{destination_folder_str}/{file_name}"
            destination_drive_item = drive.root.item_with_path(destination_fullpath)

            logger.info(f"Uploading {path} to {destination_fullpath} using resumable upload")
            try:
                uploaded_file = destination_drive_item.resumable_upload(
                    source_path=str(path)
                ).execute_query()
                # Validate the upload
                if not uploaded_file or uploaded_file.name != file_name:
                    raise DestinationConnectionError(f"Upload failed for file '{file_name}'")
                # Log details about the uploaded file
                logger.info(f"Uploaded file {uploaded_file.name} with ID {uploaded_file.id}")
            except Exception as e:
                # exc_info added for parity with the simple-upload error path.
                logger.error(
                    f"Failed to upload file '{file_name}' using resumable upload: {e}",
                    exc_info=True,
                )
                raise DestinationConnectionError(
                    f"Failed to upload file '{file_name}' using resumable upload: {e}"
                ) from e
433
+
434
+
435
# Registry entry bundling the OneDrive source-side classes (indexer + downloader and configs).
onedrive_source_entry = SourceRegistryEntry(
    connection_config=OnedriveConnectionConfig,
    indexer_config=OnedriveIndexerConfig,
    indexer=OnedriveIndexer,
    downloader_config=OnedriveDownloaderConfig,
    downloader=OnedriveDownloader,
)

# Registry entry bundling the OneDrive destination-side classes (uploader and its config).
onedrive_destination_entry = DestinationRegistryEntry(
    connection_config=OnedriveConnectionConfig,
    uploader=OnedriveUploader,
    uploader_config=OnedriveUploaderConfig,
)
@@ -0,0 +1,239 @@
1
+ import hashlib
2
+ import time
3
+ from dataclasses import dataclass, field
4
+ from datetime import timezone
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any, Coroutine, Generator
7
+
8
+ from pydantic import Field, Secret
9
+
10
+ from unstructured_ingest.error import SourceConnectionError
11
+ from unstructured_ingest.logger import logger
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Downloader,
17
+ DownloaderConfig,
18
+ DownloadResponse,
19
+ FileData,
20
+ Indexer,
21
+ IndexerConfig,
22
+ )
23
+ from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
24
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
25
+
26
+ MAX_EMAILS_PER_FOLDER = 1_000_000 # Maximum number of emails per folder
27
+
28
+ if TYPE_CHECKING:
29
+ from office365.graph_client import GraphClient
30
+ from office365.outlook.mail.folders.folder import MailFolder
31
+ from office365.outlook.mail.messages.message import Message
32
+
33
+
34
+ CONNECTOR_TYPE = "outlook"
35
+
36
+
37
class OutlookAccessConfig(AccessConfig):
    """Sensitive part of the Outlook settings; wrapped in ``Secret`` by the connection config."""

    # Declared with the alias "client_cred", i.e. that is the key name used for input.
    client_credential: str = Field(description="Azure AD App client secret", alias="client_cred")
39
+
40
+
41
class OutlookConnectionConfig(ConnectionConfig):
    """Connection settings for reading Outlook mail through Microsoft Graph."""

    access_config: Secret[OutlookAccessConfig]
    client_id: str = Field(description="Azure AD App client ID")
    tenant: str = Field(
        default="common", description="ID or domain name associated with your Azure AD instance"
    )
    authority_url: str = Field(
        default="https://login.microsoftonline.com",
        description="Authentication token provider for Microsoft apps",
    )

    @requires_dependencies(["msal"], extras="outlook")
    def _acquire_token(self):
        """Request a Microsoft Graph token from MSAL and return its response as-is."""
        from msal import ConfidentialClientApplication

        # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
        # constant as default in the future but they do not fit well with `authority_url` right now
        authority_base = self.authority_url.rstrip("/")
        client_secret = self.access_config.get_secret_value().client_credential
        msal_app = ConfidentialClientApplication(
            authority=f"{authority_base}/{self.tenant}",
            client_id=self.client_id,
            client_credential=client_secret,
        )
        return msal_app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])

    @requires_dependencies(["office365"], extras="outlook")
    @SourceConnectionError.wrap
    def get_client(self) -> "GraphClient":
        """Create a ``GraphClient`` that obtains tokens through :meth:`_acquire_token`."""
        from office365.graph_client import GraphClient

        # Hand over the bound method itself so the client decides when to call it.
        graph_client = GraphClient(self._acquire_token)
        return graph_client
74
+
75
+
76
class OutlookIndexerConfig(IndexerConfig):
    """Options selecting which mailbox and which mail folders are indexed."""

    # Matched case-insensitively against the display names of the user's mail folders.
    outlook_folders: list[str] = Field(
        description="Folders to download email messages from. Do not specify subfolders. "
        "Use quotes if there are spaces in folder names."
    )
    # When True, child folders of the selected folders are visited as well.
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders otherwise stop at the"
        " files in provided folder level.",
    )
    # Key used to select the user on the Graph client.
    user_email: str = Field(description="Outlook email to download messages from.")
87
+
88
+
89
@dataclass
class OutlookIndexer(Indexer):
    """Lists messages from the configured Outlook folders as ``FileData`` records."""

    index_config: OutlookIndexerConfig
    connection_config: OutlookConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield one ``FileData`` per message found in the selected folders."""
        messages = self._list_messages(recursive=self.index_config.recursive)

        for message in messages:
            yield self._message_to_file_data(message)

    def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
        """Not supported; this indexer is synchronous only."""
        raise NotImplementedError

    @SourceConnectionError.wrap
    def precheck(self) -> None:
        """Validate the connection by fetching the configured user."""
        client = self.connection_config.get_client()
        client.users[self.index_config.user_email].get().execute_query()

    def is_async(self) -> bool:
        """Report that this indexer runs synchronously."""
        return False

    def _list_messages(self, recursive: bool) -> list["Message"]:
        """Collect messages from the selected folders, optionally descending into children.

        At most ``MAX_EMAILS_PER_FOLDER`` messages are requested per folder.
        """
        mail_folders = self._get_selected_root_folders()
        messages = []

        # The list doubles as a work stack: folders are popped from the end and
        # their children (when recursing) are pushed back onto it.
        while mail_folders:
            mail_folder = mail_folders.pop()
            messages += list(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())

            if recursive:
                mail_folders += list(mail_folder.child_folders.get().execute_query())

        return messages

    def _get_selected_root_folders(self) -> list["MailFolder"]:
        """Return the user's top-level mail folders whose names were selected in the config.

        Names are compared case-insensitively.

        Raises:
            ValueError: if none of the configured folder names matches.
        """
        client_user = self.connection_config.get_client().users[self.index_config.user_email]
        root_mail_folders = client_user.mail_folders.get().execute_query()

        # A set gives O(1) membership checks in the filter below.
        selected_names_normalized = {
            folder_name.lower() for folder_name in self.index_config.outlook_folders
        }
        selected_root_mail_folders = [
            folder
            for folder in root_mail_folders
            if folder.display_name.lower() in selected_names_normalized
        ]

        if not selected_root_mail_folders:
            # Trailing space added: the two literals used to run together in the log.
            logger.error(
                f"Root folders selected in configuration: {self.index_config.outlook_folders} "
                f"not found for user email {self.index_config.user_email}. Aborting."
            )
            raise ValueError("Root folders selected in configuration not found.")

        return selected_root_mail_folders

    def _message_to_file_data(self, message: "Message") -> FileData:
        """Build the ``FileData`` record describing ``message``."""
        fullpath = self._generate_fullpath(message)

        return FileData(
            identifier=message.id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
            metadata=FileDataSourceMetadata(
                url=message.resource_url,
                version=message.change_key,
                # The datetimes are stamped as UTC before conversion to epoch seconds.
                date_modified=str(
                    message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
                ),
                date_created=str(message.created_datetime.replace(tzinfo=timezone.utc).timestamp()),
                date_processed=str(time.time()),
                record_locator={
                    "message_id": message.id,
                    "user_email": self.index_config.user_email,
                },
            ),
            additional_metadata={
                "sent_from": str(message.sent_from),
                "to_recipients": [str(recipient) for recipient in message.to_recipients],
                # Fixed: this previously repeated `to_recipients` (copy-paste slip).
                "bcc_recipients": [str(recipient) for recipient in message.bcc_recipients],
                "subject": message.subject,
                "conversation_id": message.conversation_id,
                "is_draft": message.is_draft,
                "is_read": message.is_read,
                "has_attachments": message.has_attachments,
                "importance": message.importance,
            },
        )

    def _generate_fullpath(self, message: "Message") -> Path:
        """Derive a ``.eml`` file name from the first 16 hex chars of SHA-256(message id)."""
        return Path(hashlib.sha256(message.id.encode("utf-8")).hexdigest()[:16] + ".eml")
182
+
183
+
184
class OutlookDownloaderConfig(DownloaderConfig):
    """Downloader settings for Outlook; declares no fields beyond ``DownloaderConfig``."""

    pass
186
+
187
+
188
@dataclass
class OutlookDownloader(Downloader):
    """Downloads a single Outlook message to the path derived from its ``FileData``."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: OutlookConnectionConfig
    download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download the message referenced by ``file_data``.

        Raises:
            ValueError: if no download path can be derived, or the record locator
                lacks the keys needed to address the message.
        """
        # NOTE: Indexer should provide source identifiers required to generate the download path
        download_path = self.get_download_path(file_data)
        if download_path is None:
            # Trailing space added: the two literals used to render as "missingfrom".
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        self._download_message(file_data, download_path)
        return self.generate_download_response(file_data, download_path)

    def is_async(self) -> bool:
        """Report that this downloader runs synchronously."""
        return False

    def _download_message(self, file_data: FileData, download_path: Path) -> None:
        """Write the message addressed by the record locator to ``download_path``.

        Raises:
            ValueError: if the record locator is missing or lacks
                ``user_email`` / ``message_id``.
        """
        # NOTE: Indexer should supply the record locator in metadata
        record_locator = file_data.metadata.record_locator
        if (
            record_locator is None
            or "user_email" not in record_locator
            or "message_id" not in record_locator
        ):
            # Space added after the period: the literals used to render as "}.Keys".
            logger.error(
                f"Invalid record locator in metadata: {record_locator}. "
                "Keys 'user_email' and 'message_id' must be present."
            )
            raise ValueError("Invalid record locator.")

        user_email = record_locator["user_email"]
        message_id = record_locator["message_id"]

        message = self.connection_config.get_client().users[user_email].messages[message_id]
        download_path.parent.mkdir(exist_ok=True, parents=True)

        with open(download_path, "wb") as file:
            message.download(file).execute_query()
231
+
232
+
233
# Registry entry bundling the Outlook source-side classes (indexer + downloader and configs).
outlook_source_entry = SourceRegistryEntry(
    indexer=OutlookIndexer,
    indexer_config=OutlookIndexerConfig,
    downloader=OutlookDownloader,
    downloader_config=OutlookDownloaderConfig,
    connection_config=OutlookConnectionConfig,
)
+ )