unstructured-ingest 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (557)
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +31 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +38 -0
  7. test/integration/connectors/databricks/__init__.py +0 -0
  8. test/integration/connectors/databricks/test_volumes_native.py +269 -0
  9. test/integration/connectors/discord/__init__.py +0 -0
  10. test/integration/connectors/discord/test_discord.py +90 -0
  11. test/integration/connectors/duckdb/__init__.py +0 -0
  12. test/integration/connectors/duckdb/conftest.py +14 -0
  13. test/integration/connectors/duckdb/test_duckdb.py +89 -0
  14. test/integration/connectors/duckdb/test_motherduck.py +95 -0
  15. test/integration/connectors/elasticsearch/__init__.py +0 -0
  16. test/integration/connectors/elasticsearch/conftest.py +34 -0
  17. test/integration/connectors/elasticsearch/test_elasticsearch.py +330 -0
  18. test/integration/connectors/elasticsearch/test_opensearch.py +325 -0
  19. test/integration/connectors/sql/__init__.py +0 -0
  20. test/integration/connectors/sql/test_postgres.py +195 -0
  21. test/integration/connectors/sql/test_singlestore.py +176 -0
  22. test/integration/connectors/sql/test_snowflake.py +238 -0
  23. test/integration/connectors/sql/test_sqlite.py +162 -0
  24. test/integration/connectors/test_astradb.py +217 -0
  25. test/integration/connectors/test_azure_ai_search.py +255 -0
  26. test/integration/connectors/test_chroma.py +120 -0
  27. test/integration/connectors/test_confluence.py +113 -0
  28. test/integration/connectors/test_delta_table.py +185 -0
  29. test/integration/connectors/test_lancedb.py +247 -0
  30. test/integration/connectors/test_milvus.py +203 -0
  31. test/integration/connectors/test_mongodb.py +335 -0
  32. test/integration/connectors/test_neo4j.py +236 -0
  33. test/integration/connectors/test_notion.py +145 -0
  34. test/integration/connectors/test_onedrive.py +118 -0
  35. test/integration/connectors/test_pinecone.py +288 -0
  36. test/integration/connectors/test_qdrant.py +215 -0
  37. test/integration/connectors/test_redis.py +119 -0
  38. test/integration/connectors/test_s3.py +183 -0
  39. test/integration/connectors/test_vectara.py +270 -0
  40. test/integration/connectors/utils/__init__.py +0 -0
  41. test/integration/connectors/utils/constants.py +7 -0
  42. test/integration/connectors/utils/docker.py +151 -0
  43. test/integration/connectors/utils/docker_compose.py +59 -0
  44. test/integration/connectors/utils/validation/__init__.py +0 -0
  45. test/integration/connectors/utils/validation/destination.py +75 -0
  46. test/integration/connectors/utils/validation/equality.py +75 -0
  47. test/integration/connectors/utils/validation/source.py +299 -0
  48. test/integration/connectors/utils/validation/utils.py +36 -0
  49. test/integration/connectors/weaviate/__init__.py +0 -0
  50. test/integration/connectors/weaviate/conftest.py +15 -0
  51. test/integration/connectors/weaviate/test_cloud.py +34 -0
  52. test/integration/connectors/weaviate/test_local.py +131 -0
  53. test/integration/embedders/__init__.py +0 -0
  54. test/integration/embedders/conftest.py +13 -0
  55. test/integration/embedders/test_azure_openai.py +59 -0
  56. test/integration/embedders/test_bedrock.py +103 -0
  57. test/integration/embedders/test_huggingface.py +26 -0
  58. test/integration/embedders/test_mixedbread.py +71 -0
  59. test/integration/embedders/test_octoai.py +77 -0
  60. test/integration/embedders/test_openai.py +76 -0
  61. test/integration/embedders/test_togetherai.py +71 -0
  62. test/integration/embedders/test_vertexai.py +65 -0
  63. test/integration/embedders/test_voyageai.py +65 -0
  64. test/integration/embedders/utils.py +68 -0
  65. test/integration/partitioners/__init__.py +0 -0
  66. test/integration/partitioners/test_partitioner.py +75 -0
  67. test/integration/utils.py +15 -0
  68. test/unit/__init__.py +0 -0
  69. test/unit/embed/__init__.py +0 -0
  70. test/unit/embed/test_mixedbreadai.py +42 -0
  71. test/unit/embed/test_octoai.py +27 -0
  72. test/unit/embed/test_openai.py +20 -0
  73. test/unit/embed/test_vertexai.py +25 -0
  74. test/unit/embed/test_voyageai.py +24 -0
  75. test/unit/test_error.py +27 -0
  76. test/unit/test_logger.py +78 -0
  77. test/unit/test_utils.py +184 -0
  78. test/unit/v2/__init__.py +0 -0
  79. test/unit/v2/chunkers/__init__.py +0 -0
  80. test/unit/v2/chunkers/test_chunkers.py +49 -0
  81. test/unit/v2/connectors/__init__.py +0 -0
  82. test/unit/v2/connectors/test_confluence.py +39 -0
  83. test/unit/v2/embedders/__init__.py +0 -0
  84. test/unit/v2/embedders/test_bedrock.py +36 -0
  85. test/unit/v2/embedders/test_huggingface.py +48 -0
  86. test/unit/v2/embedders/test_mixedbread.py +37 -0
  87. test/unit/v2/embedders/test_octoai.py +35 -0
  88. test/unit/v2/embedders/test_openai.py +35 -0
  89. test/unit/v2/embedders/test_togetherai.py +37 -0
  90. test/unit/v2/embedders/test_vertexai.py +37 -0
  91. test/unit/v2/embedders/test_voyageai.py +38 -0
  92. test/unit/v2/partitioners/__init__.py +0 -0
  93. test/unit/v2/partitioners/test_partitioner.py +63 -0
  94. test/unit/v2/test_interfaces.py +26 -0
  95. test/unit/v2/test_utils.py +82 -0
  96. test/unit/v2/utils/__init__.py +0 -0
  97. test/unit/v2/utils/data_generator.py +32 -0
  98. unstructured_ingest/__init__.py +1 -0
  99. unstructured_ingest/__version__.py +1 -0
  100. unstructured_ingest/cli/__init__.py +14 -0
  101. unstructured_ingest/cli/base/__init__.py +0 -0
  102. unstructured_ingest/cli/base/cmd.py +19 -0
  103. unstructured_ingest/cli/base/dest.py +87 -0
  104. unstructured_ingest/cli/base/src.py +57 -0
  105. unstructured_ingest/cli/cli.py +37 -0
  106. unstructured_ingest/cli/cmd_factory.py +12 -0
  107. unstructured_ingest/cli/cmds/__init__.py +145 -0
  108. unstructured_ingest/cli/cmds/airtable.py +69 -0
  109. unstructured_ingest/cli/cmds/astradb.py +99 -0
  110. unstructured_ingest/cli/cmds/azure_ai_search.py +65 -0
  111. unstructured_ingest/cli/cmds/biomed.py +52 -0
  112. unstructured_ingest/cli/cmds/chroma.py +104 -0
  113. unstructured_ingest/cli/cmds/clarifai.py +71 -0
  114. unstructured_ingest/cli/cmds/confluence.py +69 -0
  115. unstructured_ingest/cli/cmds/databricks_volumes.py +163 -0
  116. unstructured_ingest/cli/cmds/delta_table.py +94 -0
  117. unstructured_ingest/cli/cmds/discord.py +47 -0
  118. unstructured_ingest/cli/cmds/elasticsearch.py +133 -0
  119. unstructured_ingest/cli/cmds/fsspec/__init__.py +0 -0
  120. unstructured_ingest/cli/cmds/fsspec/azure.py +94 -0
  121. unstructured_ingest/cli/cmds/fsspec/box.py +48 -0
  122. unstructured_ingest/cli/cmds/fsspec/dropbox.py +51 -0
  123. unstructured_ingest/cli/cmds/fsspec/fsspec.py +15 -0
  124. unstructured_ingest/cli/cmds/fsspec/gcs.py +71 -0
  125. unstructured_ingest/cli/cmds/fsspec/s3.py +74 -0
  126. unstructured_ingest/cli/cmds/fsspec/sftp.py +58 -0
  127. unstructured_ingest/cli/cmds/github.py +54 -0
  128. unstructured_ingest/cli/cmds/gitlab.py +54 -0
  129. unstructured_ingest/cli/cmds/google_drive.py +49 -0
  130. unstructured_ingest/cli/cmds/hubspot.py +70 -0
  131. unstructured_ingest/cli/cmds/jira.py +71 -0
  132. unstructured_ingest/cli/cmds/kafka.py +102 -0
  133. unstructured_ingest/cli/cmds/local.py +43 -0
  134. unstructured_ingest/cli/cmds/mongodb.py +72 -0
  135. unstructured_ingest/cli/cmds/notion.py +48 -0
  136. unstructured_ingest/cli/cmds/onedrive.py +66 -0
  137. unstructured_ingest/cli/cmds/opensearch.py +117 -0
  138. unstructured_ingest/cli/cmds/outlook.py +67 -0
  139. unstructured_ingest/cli/cmds/pinecone.py +71 -0
  140. unstructured_ingest/cli/cmds/qdrant.py +124 -0
  141. unstructured_ingest/cli/cmds/reddit.py +67 -0
  142. unstructured_ingest/cli/cmds/salesforce.py +58 -0
  143. unstructured_ingest/cli/cmds/sharepoint.py +66 -0
  144. unstructured_ingest/cli/cmds/slack.py +56 -0
  145. unstructured_ingest/cli/cmds/sql.py +66 -0
  146. unstructured_ingest/cli/cmds/vectara.py +66 -0
  147. unstructured_ingest/cli/cmds/weaviate.py +98 -0
  148. unstructured_ingest/cli/cmds/wikipedia.py +40 -0
  149. unstructured_ingest/cli/common.py +7 -0
  150. unstructured_ingest/cli/interfaces.py +663 -0
  151. unstructured_ingest/cli/utils.py +205 -0
  152. unstructured_ingest/connector/__init__.py +0 -0
  153. unstructured_ingest/connector/airtable.py +309 -0
  154. unstructured_ingest/connector/astradb.py +267 -0
  155. unstructured_ingest/connector/azure_ai_search.py +144 -0
  156. unstructured_ingest/connector/biomed.py +320 -0
  157. unstructured_ingest/connector/chroma.py +158 -0
  158. unstructured_ingest/connector/clarifai.py +122 -0
  159. unstructured_ingest/connector/confluence.py +285 -0
  160. unstructured_ingest/connector/databricks_volumes.py +137 -0
  161. unstructured_ingest/connector/delta_table.py +203 -0
  162. unstructured_ingest/connector/discord.py +180 -0
  163. unstructured_ingest/connector/elasticsearch.py +396 -0
  164. unstructured_ingest/connector/fsspec/__init__.py +0 -0
  165. unstructured_ingest/connector/fsspec/azure.py +78 -0
  166. unstructured_ingest/connector/fsspec/box.py +109 -0
  167. unstructured_ingest/connector/fsspec/dropbox.py +160 -0
  168. unstructured_ingest/connector/fsspec/fsspec.py +359 -0
  169. unstructured_ingest/connector/fsspec/gcs.py +82 -0
  170. unstructured_ingest/connector/fsspec/s3.py +62 -0
  171. unstructured_ingest/connector/fsspec/sftp.py +81 -0
  172. unstructured_ingest/connector/git.py +124 -0
  173. unstructured_ingest/connector/github.py +174 -0
  174. unstructured_ingest/connector/gitlab.py +142 -0
  175. unstructured_ingest/connector/google_drive.py +348 -0
  176. unstructured_ingest/connector/hubspot.py +278 -0
  177. unstructured_ingest/connector/jira.py +469 -0
  178. unstructured_ingest/connector/kafka.py +293 -0
  179. unstructured_ingest/connector/local.py +139 -0
  180. unstructured_ingest/connector/mongodb.py +284 -0
  181. unstructured_ingest/connector/notion/__init__.py +0 -0
  182. unstructured_ingest/connector/notion/client.py +248 -0
  183. unstructured_ingest/connector/notion/connector.py +469 -0
  184. unstructured_ingest/connector/notion/helpers.py +584 -0
  185. unstructured_ingest/connector/notion/interfaces.py +32 -0
  186. unstructured_ingest/connector/notion/types/__init__.py +0 -0
  187. unstructured_ingest/connector/notion/types/block.py +96 -0
  188. unstructured_ingest/connector/notion/types/blocks/__init__.py +63 -0
  189. unstructured_ingest/connector/notion/types/blocks/bookmark.py +40 -0
  190. unstructured_ingest/connector/notion/types/blocks/breadcrumb.py +21 -0
  191. unstructured_ingest/connector/notion/types/blocks/bulleted_list_item.py +31 -0
  192. unstructured_ingest/connector/notion/types/blocks/callout.py +94 -0
  193. unstructured_ingest/connector/notion/types/blocks/child_database.py +23 -0
  194. unstructured_ingest/connector/notion/types/blocks/child_page.py +23 -0
  195. unstructured_ingest/connector/notion/types/blocks/code.py +43 -0
  196. unstructured_ingest/connector/notion/types/blocks/column_list.py +35 -0
  197. unstructured_ingest/connector/notion/types/blocks/divider.py +22 -0
  198. unstructured_ingest/connector/notion/types/blocks/embed.py +36 -0
  199. unstructured_ingest/connector/notion/types/blocks/equation.py +23 -0
  200. unstructured_ingest/connector/notion/types/blocks/file.py +49 -0
  201. unstructured_ingest/connector/notion/types/blocks/heading.py +37 -0
  202. unstructured_ingest/connector/notion/types/blocks/image.py +21 -0
  203. unstructured_ingest/connector/notion/types/blocks/link_preview.py +24 -0
  204. unstructured_ingest/connector/notion/types/blocks/link_to_page.py +29 -0
  205. unstructured_ingest/connector/notion/types/blocks/numbered_list.py +29 -0
  206. unstructured_ingest/connector/notion/types/blocks/paragraph.py +31 -0
  207. unstructured_ingest/connector/notion/types/blocks/pdf.py +49 -0
  208. unstructured_ingest/connector/notion/types/blocks/quote.py +37 -0
  209. unstructured_ingest/connector/notion/types/blocks/synced_block.py +57 -0
  210. unstructured_ingest/connector/notion/types/blocks/table.py +63 -0
  211. unstructured_ingest/connector/notion/types/blocks/table_of_contents.py +23 -0
  212. unstructured_ingest/connector/notion/types/blocks/template.py +30 -0
  213. unstructured_ingest/connector/notion/types/blocks/todo.py +42 -0
  214. unstructured_ingest/connector/notion/types/blocks/toggle.py +37 -0
  215. unstructured_ingest/connector/notion/types/blocks/unsupported.py +20 -0
  216. unstructured_ingest/connector/notion/types/blocks/video.py +22 -0
  217. unstructured_ingest/connector/notion/types/database.py +73 -0
  218. unstructured_ingest/connector/notion/types/database_properties/__init__.py +106 -0
  219. unstructured_ingest/connector/notion/types/database_properties/checkbox.py +38 -0
  220. unstructured_ingest/connector/notion/types/database_properties/created_by.py +35 -0
  221. unstructured_ingest/connector/notion/types/database_properties/created_time.py +34 -0
  222. unstructured_ingest/connector/notion/types/database_properties/date.py +41 -0
  223. unstructured_ingest/connector/notion/types/database_properties/email.py +36 -0
  224. unstructured_ingest/connector/notion/types/database_properties/files.py +37 -0
  225. unstructured_ingest/connector/notion/types/database_properties/formula.py +49 -0
  226. unstructured_ingest/connector/notion/types/database_properties/last_edited_by.py +34 -0
  227. unstructured_ingest/connector/notion/types/database_properties/last_edited_time.py +34 -0
  228. unstructured_ingest/connector/notion/types/database_properties/multiselect.py +73 -0
  229. unstructured_ingest/connector/notion/types/database_properties/number.py +49 -0
  230. unstructured_ingest/connector/notion/types/database_properties/people.py +40 -0
  231. unstructured_ingest/connector/notion/types/database_properties/phone_number.py +36 -0
  232. unstructured_ingest/connector/notion/types/database_properties/relation.py +67 -0
  233. unstructured_ingest/connector/notion/types/database_properties/rich_text.py +43 -0
  234. unstructured_ingest/connector/notion/types/database_properties/rollup.py +56 -0
  235. unstructured_ingest/connector/notion/types/database_properties/select.py +68 -0
  236. unstructured_ingest/connector/notion/types/database_properties/status.py +80 -0
  237. unstructured_ingest/connector/notion/types/database_properties/title.py +37 -0
  238. unstructured_ingest/connector/notion/types/database_properties/unique_id.py +50 -0
  239. unstructured_ingest/connector/notion/types/database_properties/url.py +37 -0
  240. unstructured_ingest/connector/notion/types/database_properties/verification.py +78 -0
  241. unstructured_ingest/connector/notion/types/date.py +26 -0
  242. unstructured_ingest/connector/notion/types/file.py +51 -0
  243. unstructured_ingest/connector/notion/types/page.py +45 -0
  244. unstructured_ingest/connector/notion/types/parent.py +66 -0
  245. unstructured_ingest/connector/notion/types/rich_text.py +189 -0
  246. unstructured_ingest/connector/notion/types/user.py +76 -0
  247. unstructured_ingest/connector/onedrive.py +232 -0
  248. unstructured_ingest/connector/opensearch.py +218 -0
  249. unstructured_ingest/connector/outlook.py +285 -0
  250. unstructured_ingest/connector/pinecone.py +140 -0
  251. unstructured_ingest/connector/qdrant.py +144 -0
  252. unstructured_ingest/connector/reddit.py +166 -0
  253. unstructured_ingest/connector/registry.py +109 -0
  254. unstructured_ingest/connector/salesforce.py +301 -0
  255. unstructured_ingest/connector/sharepoint.py +573 -0
  256. unstructured_ingest/connector/slack.py +224 -0
  257. unstructured_ingest/connector/sql.py +199 -0
  258. unstructured_ingest/connector/vectara.py +253 -0
  259. unstructured_ingest/connector/weaviate.py +190 -0
  260. unstructured_ingest/connector/wikipedia.py +208 -0
  261. unstructured_ingest/embed/__init__.py +0 -0
  262. unstructured_ingest/embed/azure_openai.py +31 -0
  263. unstructured_ingest/embed/bedrock.py +193 -0
  264. unstructured_ingest/embed/huggingface.py +52 -0
  265. unstructured_ingest/embed/interfaces.py +117 -0
  266. unstructured_ingest/embed/mixedbreadai.py +233 -0
  267. unstructured_ingest/embed/octoai.py +130 -0
  268. unstructured_ingest/embed/openai.py +116 -0
  269. unstructured_ingest/embed/togetherai.py +106 -0
  270. unstructured_ingest/embed/vertexai.py +126 -0
  271. unstructured_ingest/embed/voyageai.py +130 -0
  272. unstructured_ingest/enhanced_dataclass/__init__.py +4 -0
  273. unstructured_ingest/enhanced_dataclass/core.py +99 -0
  274. unstructured_ingest/enhanced_dataclass/dataclasses.py +54 -0
  275. unstructured_ingest/enhanced_dataclass/json_mixin.py +125 -0
  276. unstructured_ingest/error.py +49 -0
  277. unstructured_ingest/ingest_backoff/__init__.py +3 -0
  278. unstructured_ingest/ingest_backoff/_common.py +102 -0
  279. unstructured_ingest/ingest_backoff/_wrapper.py +122 -0
  280. unstructured_ingest/interfaces.py +852 -0
  281. unstructured_ingest/logger.py +130 -0
  282. unstructured_ingest/main.py +11 -0
  283. unstructured_ingest/pipeline/__init__.py +22 -0
  284. unstructured_ingest/pipeline/copy.py +19 -0
  285. unstructured_ingest/pipeline/doc_factory.py +12 -0
  286. unstructured_ingest/pipeline/interfaces.py +270 -0
  287. unstructured_ingest/pipeline/partition.py +60 -0
  288. unstructured_ingest/pipeline/permissions.py +12 -0
  289. unstructured_ingest/pipeline/pipeline.py +117 -0
  290. unstructured_ingest/pipeline/reformat/__init__.py +0 -0
  291. unstructured_ingest/pipeline/reformat/chunking.py +134 -0
  292. unstructured_ingest/pipeline/reformat/embedding.py +64 -0
  293. unstructured_ingest/pipeline/source.py +77 -0
  294. unstructured_ingest/pipeline/utils.py +6 -0
  295. unstructured_ingest/pipeline/write.py +18 -0
  296. unstructured_ingest/processor.py +93 -0
  297. unstructured_ingest/runner/__init__.py +104 -0
  298. unstructured_ingest/runner/airtable.py +35 -0
  299. unstructured_ingest/runner/astradb.py +34 -0
  300. unstructured_ingest/runner/base_runner.py +89 -0
  301. unstructured_ingest/runner/biomed.py +45 -0
  302. unstructured_ingest/runner/confluence.py +35 -0
  303. unstructured_ingest/runner/delta_table.py +34 -0
  304. unstructured_ingest/runner/discord.py +35 -0
  305. unstructured_ingest/runner/elasticsearch.py +40 -0
  306. unstructured_ingest/runner/fsspec/__init__.py +0 -0
  307. unstructured_ingest/runner/fsspec/azure.py +30 -0
  308. unstructured_ingest/runner/fsspec/box.py +28 -0
  309. unstructured_ingest/runner/fsspec/dropbox.py +30 -0
  310. unstructured_ingest/runner/fsspec/fsspec.py +40 -0
  311. unstructured_ingest/runner/fsspec/gcs.py +28 -0
  312. unstructured_ingest/runner/fsspec/s3.py +28 -0
  313. unstructured_ingest/runner/fsspec/sftp.py +28 -0
  314. unstructured_ingest/runner/github.py +37 -0
  315. unstructured_ingest/runner/gitlab.py +37 -0
  316. unstructured_ingest/runner/google_drive.py +35 -0
  317. unstructured_ingest/runner/hubspot.py +35 -0
  318. unstructured_ingest/runner/jira.py +35 -0
  319. unstructured_ingest/runner/kafka.py +34 -0
  320. unstructured_ingest/runner/local.py +23 -0
  321. unstructured_ingest/runner/mongodb.py +34 -0
  322. unstructured_ingest/runner/notion.py +61 -0
  323. unstructured_ingest/runner/onedrive.py +35 -0
  324. unstructured_ingest/runner/opensearch.py +40 -0
  325. unstructured_ingest/runner/outlook.py +33 -0
  326. unstructured_ingest/runner/reddit.py +35 -0
  327. unstructured_ingest/runner/salesforce.py +33 -0
  328. unstructured_ingest/runner/sharepoint.py +35 -0
  329. unstructured_ingest/runner/slack.py +33 -0
  330. unstructured_ingest/runner/utils.py +47 -0
  331. unstructured_ingest/runner/wikipedia.py +35 -0
  332. unstructured_ingest/runner/writers/__init__.py +48 -0
  333. unstructured_ingest/runner/writers/astradb.py +22 -0
  334. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  335. unstructured_ingest/runner/writers/base_writer.py +26 -0
  336. unstructured_ingest/runner/writers/chroma.py +22 -0
  337. unstructured_ingest/runner/writers/clarifai.py +19 -0
  338. unstructured_ingest/runner/writers/databricks_volumes.py +25 -0
  339. unstructured_ingest/runner/writers/delta_table.py +24 -0
  340. unstructured_ingest/runner/writers/elasticsearch.py +24 -0
  341. unstructured_ingest/runner/writers/fsspec/__init__.py +0 -0
  342. unstructured_ingest/runner/writers/fsspec/azure.py +24 -0
  343. unstructured_ingest/runner/writers/fsspec/box.py +21 -0
  344. unstructured_ingest/runner/writers/fsspec/dropbox.py +21 -0
  345. unstructured_ingest/runner/writers/fsspec/gcs.py +19 -0
  346. unstructured_ingest/runner/writers/fsspec/s3.py +21 -0
  347. unstructured_ingest/runner/writers/kafka.py +21 -0
  348. unstructured_ingest/runner/writers/mongodb.py +21 -0
  349. unstructured_ingest/runner/writers/opensearch.py +26 -0
  350. unstructured_ingest/runner/writers/pinecone.py +21 -0
  351. unstructured_ingest/runner/writers/qdrant.py +19 -0
  352. unstructured_ingest/runner/writers/sql.py +22 -0
  353. unstructured_ingest/runner/writers/vectara.py +22 -0
  354. unstructured_ingest/runner/writers/weaviate.py +21 -0
  355. unstructured_ingest/utils/__init__.py +0 -0
  356. unstructured_ingest/utils/chunking.py +56 -0
  357. unstructured_ingest/utils/compression.py +118 -0
  358. unstructured_ingest/utils/data_prep.py +200 -0
  359. unstructured_ingest/utils/dep_check.py +78 -0
  360. unstructured_ingest/utils/google_filetype.py +9 -0
  361. unstructured_ingest/utils/string_and_date_utils.py +49 -0
  362. unstructured_ingest/utils/table.py +73 -0
  363. unstructured_ingest/v2/__init__.py +1 -0
  364. unstructured_ingest/v2/cli/__init__.py +0 -0
  365. unstructured_ingest/v2/cli/base/__init__.py +4 -0
  366. unstructured_ingest/v2/cli/base/cmd.py +269 -0
  367. unstructured_ingest/v2/cli/base/dest.py +85 -0
  368. unstructured_ingest/v2/cli/base/importer.py +34 -0
  369. unstructured_ingest/v2/cli/base/src.py +85 -0
  370. unstructured_ingest/v2/cli/cli.py +24 -0
  371. unstructured_ingest/v2/cli/cmds.py +14 -0
  372. unstructured_ingest/v2/cli/utils/__init__.py +0 -0
  373. unstructured_ingest/v2/cli/utils/click.py +237 -0
  374. unstructured_ingest/v2/cli/utils/model_conversion.py +222 -0
  375. unstructured_ingest/v2/constants.py +2 -0
  376. unstructured_ingest/v2/errors.py +18 -0
  377. unstructured_ingest/v2/interfaces/__init__.py +32 -0
  378. unstructured_ingest/v2/interfaces/connector.py +50 -0
  379. unstructured_ingest/v2/interfaces/downloader.py +89 -0
  380. unstructured_ingest/v2/interfaces/file_data.py +116 -0
  381. unstructured_ingest/v2/interfaces/indexer.py +30 -0
  382. unstructured_ingest/v2/interfaces/process.py +19 -0
  383. unstructured_ingest/v2/interfaces/processor.py +88 -0
  384. unstructured_ingest/v2/interfaces/upload_stager.py +102 -0
  385. unstructured_ingest/v2/interfaces/uploader.py +53 -0
  386. unstructured_ingest/v2/logger.py +126 -0
  387. unstructured_ingest/v2/main.py +11 -0
  388. unstructured_ingest/v2/otel.py +111 -0
  389. unstructured_ingest/v2/pipeline/__init__.py +0 -0
  390. unstructured_ingest/v2/pipeline/interfaces.py +211 -0
  391. unstructured_ingest/v2/pipeline/otel.py +32 -0
  392. unstructured_ingest/v2/pipeline/pipeline.py +384 -0
  393. unstructured_ingest/v2/pipeline/steps/__init__.py +0 -0
  394. unstructured_ingest/v2/pipeline/steps/chunk.py +80 -0
  395. unstructured_ingest/v2/pipeline/steps/download.py +207 -0
  396. unstructured_ingest/v2/pipeline/steps/embed.py +79 -0
  397. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  398. unstructured_ingest/v2/pipeline/steps/index.py +86 -0
  399. unstructured_ingest/v2/pipeline/steps/partition.py +79 -0
  400. unstructured_ingest/v2/pipeline/steps/stage.py +65 -0
  401. unstructured_ingest/v2/pipeline/steps/uncompress.py +50 -0
  402. unstructured_ingest/v2/pipeline/steps/upload.py +58 -0
  403. unstructured_ingest/v2/processes/__init__.py +18 -0
  404. unstructured_ingest/v2/processes/chunker.py +124 -0
  405. unstructured_ingest/v2/processes/connector_registry.py +69 -0
  406. unstructured_ingest/v2/processes/connectors/__init__.py +117 -0
  407. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  408. unstructured_ingest/v2/processes/connectors/astradb.py +402 -0
  409. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +276 -0
  410. unstructured_ingest/v2/processes/connectors/chroma.py +190 -0
  411. unstructured_ingest/v2/processes/connectors/confluence.py +207 -0
  412. unstructured_ingest/v2/processes/connectors/couchbase.py +334 -0
  413. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  414. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +208 -0
  415. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  416. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  417. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  418. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  419. unstructured_ingest/v2/processes/connectors/delta_table.py +191 -0
  420. unstructured_ingest/v2/processes/connectors/discord.py +158 -0
  421. unstructured_ingest/v2/processes/connectors/duckdb/__init__.py +15 -0
  422. unstructured_ingest/v2/processes/connectors/duckdb/base.py +100 -0
  423. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +127 -0
  424. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +126 -0
  425. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  426. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +470 -0
  427. unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py +195 -0
  428. unstructured_ingest/v2/processes/connectors/fsspec/__init__.py +37 -0
  429. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +197 -0
  430. unstructured_ingest/v2/processes/connectors/fsspec/box.py +170 -0
  431. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +168 -0
  432. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +332 -0
  433. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +197 -0
  434. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +185 -0
  435. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +171 -0
  436. unstructured_ingest/v2/processes/connectors/fsspec/utils.py +17 -0
  437. unstructured_ingest/v2/processes/connectors/gitlab.py +268 -0
  438. unstructured_ingest/v2/processes/connectors/google_drive.py +348 -0
  439. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  440. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +121 -0
  441. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +273 -0
  442. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  443. unstructured_ingest/v2/processes/connectors/kdbai.py +148 -0
  444. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +30 -0
  445. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  446. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  447. unstructured_ingest/v2/processes/connectors/lancedb/cloud.py +42 -0
  448. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  449. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +169 -0
  450. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  451. unstructured_ingest/v2/processes/connectors/local.py +217 -0
  452. unstructured_ingest/v2/processes/connectors/milvus.py +225 -0
  453. unstructured_ingest/v2/processes/connectors/mongodb.py +361 -0
  454. unstructured_ingest/v2/processes/connectors/neo4j.py +385 -0
  455. unstructured_ingest/v2/processes/connectors/notion/__init__.py +0 -0
  456. unstructured_ingest/v2/processes/connectors/notion/client.py +349 -0
  457. unstructured_ingest/v2/processes/connectors/notion/connector.py +346 -0
  458. unstructured_ingest/v2/processes/connectors/notion/helpers.py +448 -0
  459. unstructured_ingest/v2/processes/connectors/notion/interfaces.py +32 -0
  460. unstructured_ingest/v2/processes/connectors/notion/types/__init__.py +0 -0
  461. unstructured_ingest/v2/processes/connectors/notion/types/block.py +96 -0
  462. unstructured_ingest/v2/processes/connectors/notion/types/blocks/__init__.py +63 -0
  463. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  464. unstructured_ingest/v2/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  465. unstructured_ingest/v2/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  466. unstructured_ingest/v2/processes/connectors/notion/types/blocks/callout.py +94 -0
  467. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_database.py +23 -0
  468. unstructured_ingest/v2/processes/connectors/notion/types/blocks/child_page.py +23 -0
  469. unstructured_ingest/v2/processes/connectors/notion/types/blocks/code.py +43 -0
  470. unstructured_ingest/v2/processes/connectors/notion/types/blocks/column_list.py +35 -0
  471. unstructured_ingest/v2/processes/connectors/notion/types/blocks/divider.py +22 -0
  472. unstructured_ingest/v2/processes/connectors/notion/types/blocks/embed.py +36 -0
  473. unstructured_ingest/v2/processes/connectors/notion/types/blocks/equation.py +23 -0
  474. unstructured_ingest/v2/processes/connectors/notion/types/blocks/file.py +49 -0
  475. unstructured_ingest/v2/processes/connectors/notion/types/blocks/heading.py +37 -0
  476. unstructured_ingest/v2/processes/connectors/notion/types/blocks/image.py +21 -0
  477. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  478. unstructured_ingest/v2/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  479. unstructured_ingest/v2/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  480. unstructured_ingest/v2/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  481. unstructured_ingest/v2/processes/connectors/notion/types/blocks/pdf.py +49 -0
  482. unstructured_ingest/v2/processes/connectors/notion/types/blocks/quote.py +37 -0
  483. unstructured_ingest/v2/processes/connectors/notion/types/blocks/synced_block.py +57 -0
  484. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table.py +63 -0
  485. unstructured_ingest/v2/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  486. unstructured_ingest/v2/processes/connectors/notion/types/blocks/template.py +30 -0
  487. unstructured_ingest/v2/processes/connectors/notion/types/blocks/todo.py +42 -0
  488. unstructured_ingest/v2/processes/connectors/notion/types/blocks/toggle.py +37 -0
  489. unstructured_ingest/v2/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  490. unstructured_ingest/v2/processes/connectors/notion/types/blocks/video.py +22 -0
  491. unstructured_ingest/v2/processes/connectors/notion/types/database.py +73 -0
  492. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/__init__.py +106 -0
  493. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/checkbox.py +38 -0
  494. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_by.py +35 -0
  495. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/created_time.py +34 -0
  496. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/date.py +41 -0
  497. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/email.py +36 -0
  498. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/files.py +37 -0
  499. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/formula.py +49 -0
  500. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  501. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/last_edited_time.py +34 -0
  502. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/multiselect.py +73 -0
  503. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/number.py +49 -0
  504. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/people.py +41 -0
  505. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/phone_number.py +36 -0
  506. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/relation.py +67 -0
  507. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rich_text.py +43 -0
  508. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/rollup.py +56 -0
  509. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/select.py +69 -0
  510. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/status.py +81 -0
  511. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/title.py +37 -0
  512. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/unique_id.py +50 -0
  513. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/url.py +37 -0
  514. unstructured_ingest/v2/processes/connectors/notion/types/database_properties/verification.py +78 -0
  515. unstructured_ingest/v2/processes/connectors/notion/types/date.py +29 -0
  516. unstructured_ingest/v2/processes/connectors/notion/types/file.py +54 -0
  517. unstructured_ingest/v2/processes/connectors/notion/types/page.py +45 -0
  518. unstructured_ingest/v2/processes/connectors/notion/types/parent.py +66 -0
  519. unstructured_ingest/v2/processes/connectors/notion/types/rich_text.py +189 -0
  520. unstructured_ingest/v2/processes/connectors/notion/types/user.py +79 -0
  521. unstructured_ingest/v2/processes/connectors/onedrive.py +447 -0
  522. unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
  523. unstructured_ingest/v2/processes/connectors/pinecone.py +277 -0
  524. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  525. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  526. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  527. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +160 -0
  528. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  529. unstructured_ingest/v2/processes/connectors/redisdb.py +182 -0
  530. unstructured_ingest/v2/processes/connectors/salesforce.py +303 -0
  531. unstructured_ingest/v2/processes/connectors/sharepoint.py +448 -0
  532. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  533. unstructured_ingest/v2/processes/connectors/sql/__init__.py +27 -0
  534. unstructured_ingest/v2/processes/connectors/sql/postgres.py +162 -0
  535. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +166 -0
  536. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +210 -0
  537. unstructured_ingest/v2/processes/connectors/sql/sql.py +434 -0
  538. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +168 -0
  539. unstructured_ingest/v2/processes/connectors/utils.py +29 -0
  540. unstructured_ingest/v2/processes/connectors/vectara.py +350 -0
  541. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  542. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +165 -0
  543. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  544. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  545. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +267 -0
  546. unstructured_ingest/v2/processes/embedder.py +195 -0
  547. unstructured_ingest/v2/processes/filter.py +60 -0
  548. unstructured_ingest/v2/processes/partitioner.py +188 -0
  549. unstructured_ingest/v2/processes/uncompress.py +61 -0
  550. unstructured_ingest/v2/unstructured_api.py +128 -0
  551. unstructured_ingest/v2/utils.py +61 -0
  552. unstructured_ingest-0.3.13.dist-info/LICENSE.md +201 -0
  553. unstructured_ingest-0.3.13.dist-info/METADATA +205 -0
  554. unstructured_ingest-0.3.13.dist-info/RECORD +557 -0
  555. unstructured_ingest-0.3.13.dist-info/WHEEL +5 -0
  556. unstructured_ingest-0.3.13.dist-info/entry_points.txt +2 -0
  557. unstructured_ingest-0.3.13.dist-info/top_level.txt +2 -0
@@ -0,0 +1,448 @@
1
+ import json
2
+ from dataclasses import dataclass, field
3
+ from enum import Enum
4
+ from pathlib import Path
5
+ from time import time
6
+ from typing import TYPE_CHECKING, Any, Generator, Optional
7
+ from urllib.parse import quote
8
+
9
+ from pydantic import BaseModel, Field, Secret, SecretStr
10
+
11
+ from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12
+ from unstructured_ingest.utils.dep_check import requires_dependencies
13
+ from unstructured_ingest.v2.interfaces import (
14
+ AccessConfig,
15
+ ConnectionConfig,
16
+ Downloader,
17
+ DownloaderConfig,
18
+ DownloadResponse,
19
+ FileData,
20
+ FileDataSourceMetadata,
21
+ Indexer,
22
+ IndexerConfig,
23
+ SourceIdentifiers,
24
+ )
25
+ from unstructured_ingest.v2.logger import logger
26
+ from unstructured_ingest.v2.processes.connector_registry import (
27
+ SourceRegistryEntry,
28
+ )
29
+
30
+ from .utils import parse_datetime
31
+
32
+ if TYPE_CHECKING:
33
+ from office365.graph_client import GraphClient
34
+ from office365.onedrive.driveitems.driveItem import DriveItem
35
+ from office365.onedrive.drives.drive import Drive
36
+ from office365.onedrive.permissions.permission import Permission
37
+ from office365.onedrive.sites.site import Site
38
+ from office365.sharepoint.client_context import ClientContext
39
+ from office365.sharepoint.files.file import File
40
+ from office365.sharepoint.folders.folder import Folder
41
+ from office365.sharepoint.publishing.pages.page import SitePage
42
+
43
# Connector identifier: stamped on every FileData this module creates and used as
# the default connector_type of SharepointDownloader.
+ CONNECTOR_TYPE = "sharepoint"
44
+
45
# NOTE(review): not referenced anywhere else in this module. Despite the "MB" in the
# name, the value looks like a byte count (~512 MB) -- confirm before relying on it.
+ MAX_MB_SIZE = 512_000_000
46
+
47
+ # TODO handle other data types possible from Sharepoint
48
+ # examples: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint
49
+
50
+
51
class SharepointContentType(Enum):
    """Kinds of Sharepoint content the connector tells apart.

    The string value is what the indexer stores under the
    ``sharepoint_content_type`` key of ``additional_metadata`` and what the
    downloader compares against when choosing how to fetch a record.
    """

    # A file stored in a document library.
    DOCUMENT = "document"
    # A site page; the downloader writes these out as ``.html`` files.
    SITEPAGE = "site_page"
    # Declared for completeness; nothing in this module produces it yet.
    LIST = "list"
55
+
56
+
57
class SharepointAccessConfig(AccessConfig):
    # Sensitive part of the Sharepoint connection settings; the connection config
    # holds it wrapped in ``Secret[...]``.

    # Secret of the Sharepoint app registration, paired with the connection's client_id.
    client_cred: str = Field(description="Sharepoint app secret")
59
+
60
+
61
class SharepointPermissionsConfig(BaseModel):
    # Optional Microsoft Graph settings used to enrich indexed files with
    # permissions data. Every field has a default, so the model can be built empty.

    # Application (client) id passed to the MSAL confidential client.
    permissions_application_id: Optional[str] = Field(
        description="Microsoft Graph API application id",
        default=None,
    )
    # Tenant appended to the authority url when requesting a token.
    permissions_tenant: Optional[str] = Field(
        description="url to get permissions data within tenant.",
        examples=["https://contoso.onmicrosoft.com"],
        default=None,
    )
    # Client credential passed to the MSAL confidential client.
    permissions_client_cred: Optional[SecretStr] = Field(
        description="Microsoft Graph API application credentials",
        default=None,
    )
    # Base authority url; excluded from the repr and defaulting to the public
    # Microsoft login endpoint.
    authority_url: Optional[SecretStr] = Field(
        description="Permissions authority url",
        examples=["https://login.microsoftonline.com"],
        default_factory=lambda: SecretStr("https://login.microsoftonline.com"),
        repr=False,
    )
79
+
80
+
81
class SharepointConnectionConfig(ConnectionConfig):
    # Connection settings for a Sharepoint site: the app registration used to read
    # content, plus optional Microsoft Graph settings for permission lookups.

    client_id: str = Field(description="Sharepoint app client ID")
    # Adjacent string literals are used instead of backslash continuations inside a
    # single literal, so source indentation does not leak into the description text.
    site: str = Field(
        description=(
            "Sharepoint site url. Process either base url e.g "
            "https://[tenant].sharepoint.com or relative sites "
            "https://[tenant].sharepoint.com/sites/<site_name>. "
            "To process all sites within the tenant pass a site url as "
            "https://[tenant]-admin.sharepoint.com. "
            "This requires the app to be registered at a tenant level"
        )
    )
    access_config: Secret[SharepointAccessConfig]
    permissions_config: Optional[SharepointPermissionsConfig] = None

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_client(self) -> "ClientContext":
        """Build a Sharepoint client for ``site`` using the app credentials.

        Returns:
            A ``ClientContext`` for ``self.site`` with client credentials attached.

        Raises:
            Exception: anything raised while building the client is logged and
                re-raised unchanged.
        """
        from office365.runtime.auth.client_credential import ClientCredential
        from office365.sharepoint.client_context import ClientContext

        try:
            credentials = ClientCredential(
                self.client_id, self.access_config.get_secret_value().client_cred
            )
            site_client = ClientContext(self.site).with_credentials(credentials)
        except Exception as e:
            logger.error(f"Couldn't set Sharepoint client: {e}")
            # Bare raise keeps the original traceback intact.
            raise
        return site_client

    @requires_dependencies(["msal"], extras="sharepoint")
    def get_permissions_token(self):
        """Acquire a Microsoft Graph token with the permissions credentials.

        Reads ``permissions_config``; callers are expected to have checked that it
        is set (``get_permissions_client`` does).

        Returns:
            The token response returned by ``acquire_token_for_client``.

        Raises:
            ValueError: re-raised (after logging) when MSAL rejects the credentials.
            SourceConnectionNetworkError: when the response carries an ``error`` key.
        """
        from msal import ConfidentialClientApplication

        try:
            client_credential = self.permissions_config.permissions_client_cred.get_secret_value()
            app = ConfidentialClientApplication(
                authority=f"{self.permissions_config.authority_url.get_secret_value()}/"
                f"{self.permissions_config.permissions_tenant}",
                client_id=self.permissions_config.permissions_application_id,
                client_credential=client_credential,
            )
            token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        except ValueError:
            logger.error("Couldn't set up credentials for Sharepoint")
            raise
        if "error" in token:
            # ``.get`` so a response without a description still surfaces the real
            # error instead of failing with a KeyError while reporting it.
            raise SourceConnectionNetworkError(
                "failed to fetch token, {}: {}".format(
                    token["error"], token.get("error_description")
                )
            )
        return token

    @requires_dependencies(["office365"], extras="sharepoint")
    def get_permissions_client(self) -> Optional["GraphClient"]:
        """Return a Graph client for permission lookups, or ``None`` if not configured.

        The client is handed ``get_permissions_token`` as its token callback.
        """
        from office365.graph_client import GraphClient

        if self.permissions_config is None:
            return None

        return GraphClient(self.get_permissions_token)
140
+
141
+
142
class SharepointIndexerConfig(IndexerConfig):
    # Options controlling what the Sharepoint indexer enumerates.

    # Written as adjacent string literals (like ``recursive`` below) rather than
    # backslash continuations inside one literal, which would embed the source
    # indentation in the description text.
    path: Optional[str] = Field(
        default=None,
        description=(
            "Path from which to start parsing files. If the connector is to "
            "process all sites within the tenant this filter will be applied to "
            "all sites document libraries."
        ),
    )
    recursive: bool = Field(
        default=False,
        description="Recursively download files in their respective folders "
        "otherwise stop at the files in provided folder level.",
    )
    omit_files: bool = Field(default=False, description="Don't process files.")
    omit_pages: bool = Field(default=False, description="Don't process site pages.")
    # NOTE(review): the indexer in this module never reads omit_lists.
    omit_lists: bool = Field(default=False, description="Don't process lists.")
157
+
158
+
159
@dataclass
class SharepointIndexer(Indexer):
    """Enumerate documents and site pages of a Sharepoint site as ``FileData`` records."""

    connection_config: SharepointConnectionConfig
    index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())

    def precheck(self) -> None:
        """Validate the connection by querying the site's pages.

        Raises:
            SourceConnectionError: if the client cannot be built or the query fails;
                the original exception is attached as the cause.
        """
        try:
            site_client = self.connection_config.get_client()
            site_client.site_pages.pages.get().execute_query()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
        """List the files in ``folder``, optionally descending into sub-folders.

        Args:
            folder: folder to list; it is expanded and queried in place.
            recursive: when true, files of nested folders are included as well.

        Returns:
            A list of the files found.
        """
        if not recursive:
            folder.expand(["Files"]).get().execute_query()
            # Materialise so both branches return a real list, as annotated.
            return list(folder.files)

        folder.expand(["Files", "Folders"]).get().execute_query()
        files: list["File"] = list(folder.files)
        folders: list["Folder"] = list(folder.folders)
        for f in folders:
            # Folders whose url contains "/Forms" are skipped -- presumably the
            # document library's internal Forms folder; confirm.
            if "/Forms" in f.serverRelativeUrl:
                continue
            files.extend(self.list_files(f, recursive))
        return files

    def get_properties(self, raw_properties: dict) -> dict:
        """Return the truthy, JSON-serialisable subset of ``raw_properties``."""
        filtered_properties = {}
        for k, v in raw_properties.items():
            if not v:
                continue
            try:
                json.dumps(v)
            except TypeError:
                # Deliberately dropped: the value cannot be serialised as JSON.
                continue
            filtered_properties[k] = v
        return filtered_properties

    def list_pages(self, client: "ClientContext") -> list["SitePage"]:
        """Fetch the site pages of the site ``client`` is bound to."""
        pages = client.site_pages.pages.get().execute_query()
        return pages

    def page_to_file_data(self, site_page: "SitePage") -> FileData:
        """Convert a site page into a ``FileData`` record.

        The page's properties are expanded and fetched first, then copied into
        ``additional_metadata`` together with the ``site_page`` content type.
        """
        site_page.expand(site_page.properties.keys()).get().execute_query()
        version = site_page.properties.get("Version", None)
        unique_id = site_page.properties.get("UniqueId", None)
        modified_date = site_page.properties.get("Modified", None)
        url = site_page.properties.get("AbsoluteUrl", None)
        date_modified_dt = parse_datetime(modified_date) if modified_date else None
        # "0001-01-01T08:00:00Z" is treated the same as a missing publish date.
        date_created_at = (
            parse_datetime(site_page.first_published)
            if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z")
            else None
        )
        file_path = site_page.get_property("Url", "")
        # startswith() instead of indexing so an empty url does not raise IndexError.
        server_path = file_path[1:] if file_path.startswith("/") else file_path
        additional_metadata = self.get_properties(raw_properties=site_page.properties)
        additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value
        # ``path`` is only filled in by get_root(); fall back to "" so replace()
        # is a no-op rather than a TypeError when it is still None.
        index_path = self.index_config.path or ""
        return FileData(
            identifier=unique_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=site_page.file_name,
                fullpath=file_path,
                rel_path=file_path.replace(index_path, ""),
            ),
            metadata=FileDataSourceMetadata(
                url=url,
                version=version,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={
                    "server_path": server_path,
                },
            ),
            additional_metadata=additional_metadata,
        )

    def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData:
        """Convert a document-library file into a ``FileData`` record.

        The file's properties are expanded and fetched first, then copied into
        ``additional_metadata`` together with the ``document`` content type.
        """
        file.expand(file.properties.keys()).get().execute_query()
        absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}"
        date_modified_dt = (
            parse_datetime(file.time_last_modified) if file.time_last_modified else None
        )

        date_created_at = parse_datetime(file.time_created) if file.time_created else None
        additional_metadata = self.get_properties(raw_properties=file.properties)
        additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value
        fullpath = str(file.serverRelativeUrl)
        # lstrip() strips every leading slash like the previous loop did, but is
        # safe when the remainder is empty; "" guards a still-unset index path.
        rel_path = fullpath.replace(self.index_config.path or "", "").lstrip("/")
        return FileData(
            identifier=file.unique_id,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=file.name,
                fullpath=fullpath,
                rel_path=rel_path,
            ),
            metadata=FileDataSourceMetadata(
                url=absolute_url,
                version=f"{file.major_version}.{file.minor_version}",
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                date_processed=str(time()),
                record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url},
            ),
            additional_metadata=additional_metadata,
        )

    def get_root(self, client: "ClientContext") -> "Folder":
        """Return the folder indexing starts from.

        Uses ``index_config.path`` when set; otherwise falls back to the root folder
        of the default document library and stores its name in ``index_config.path``
        (a side effect the ``rel_path`` computations rely on).
        """
        if path := self.index_config.path:
            return client.web.get_folder_by_server_relative_path(path)
        default_document_library = client.web.default_document_library()
        root_folder = default_document_library.root_folder
        root_folder = root_folder.get().execute_query()
        self.index_config.path = root_folder.name
        return root_folder

    def get_site_url(self, client: "ClientContext") -> str:
        """Query the site and return its url."""
        res = client.web.get().execute_query()
        return res.url

    def get_site(self, permissions_client: "GraphClient", site_url) -> "Site":
        """Look up the Graph ``Site`` object for ``site_url``."""
        return permissions_client.sites.get_by_url(url=site_url).execute_query()

    def get_permissions_items(self, site: "Site") -> list["DriveItem"]:
        """Collect the root-level children of every drive of ``site``."""
        # TODO find a way to narrow this search down by name of drive
        items: list["DriveItem"] = []
        drives: list["Drive"] = site.drives.get_all().execute_query()
        for drive in drives:
            items.extend(drive.root.children.get_all().execute_query())
        return items

    def map_permission(self, permission: "Permission") -> dict:
        """Flatten a Graph ``Permission`` into a plain dict."""
        return {
            "id": permission.id,
            "roles": list(permission.roles),
            "share_id": permission.share_id,
            "has_password": permission.has_password,
            "link": permission.link.to_json(),
            "granted_to_identities": permission.granted_to_identities.to_json(),
            "granted_to": permission.granted_to.to_json(),
            "granted_to_v2": permission.granted_to_v2.to_json(),
            "granted_to_identities_v2": permission.granted_to_identities_v2.to_json(),
            "invitation": permission.invitation.to_json(),
        }

    def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None:
        """Attach permissions data to each record, matching drive items by ETag.

        Records are updated in place. A record is left untouched when it has no
        ``ETag`` metadata, when no drive item matches, or when the match is
        ambiguous (more than one item shares the ETag).
        """
        logger.debug("Enriching permissions on files")
        permission_client = self.connection_config.get_permissions_client()
        if permission_client is None:
            return
        site = self.get_site(permissions_client=permission_client, site_url=site_url)
        existing_items = self.get_permissions_items(site=site)
        for file_data in all_file_data:
            etag = file_data.additional_metadata.get("ETag")
            if not etag:
                continue
            matching_items = [item for item in existing_items if item.etag == etag]
            if not matching_items:
                continue
            if len(matching_items) > 1:
                logger.warning(
                    "Found multiple drive items with etag matching {}, skipping: {}".format(
                        etag, ", ".join([i.name for i in matching_items])
                    )
                )
                continue
            matching_item = matching_items[0]
            permissions: list["Permission"] = matching_item.permissions.get_all().execute_query()
            file_data.metadata.permissions_data = [
                self.map_permission(permission=permission) for permission in permissions
            ]

    @property
    def process_permissions(self) -> bool:
        """Whether enough permissions settings are present to query Graph.

        True only when a permissions config exists and its tenant, client credential
        and application id are all non-empty.
        """
        permissions_config = self.connection_config.permissions_config
        if permissions_config is None:
            return False
        client_cred = permissions_config.permissions_client_cred
        # The credential is Optional and defaults to None; check it before
        # unwrapping instead of failing with AttributeError.
        if client_cred is None:
            return False
        return bool(
            permissions_config.permissions_tenant
            and client_cred.get_secret_value()
            and permissions_config.permissions_application_id
        )

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every document, then for every site page.

        Documents are skipped when ``omit_files`` is set and pages when
        ``omit_pages`` is set. Documents are enriched with permissions first when
        ``process_permissions`` is true.
        """
        client = self.connection_config.get_client()
        # Also fills in index_config.path when it was not configured.
        root_folder = self.get_root(client=client)
        logger.debug(f"processing content from path: {self.index_config.path}")
        if not self.index_config.omit_files:
            files = self.list_files(root_folder, recursive=self.index_config.recursive)
            all_file_data = [self.file_to_file_data(file=file, client=client) for file in files]
            if self.process_permissions:
                self.enrich_permissions_on_files(
                    all_file_data=all_file_data, site_url=self.get_site_url(client=client)
                )
            yield from all_file_data
        if not self.index_config.omit_pages:
            for page in self.list_pages(client=client):
                page_file_data = self.page_to_file_data(site_page=page)
                page_file_data.metadata.record_locator["site_url"] = client.base_url
                yield page_file_data
365
+
366
+
367
class SharepointDownloaderConfig(DownloaderConfig):
    # No Sharepoint-specific download options: everything comes from DownloaderConfig.
    ...
369
+
370
+
371
@dataclass
class SharepointDownloader(Downloader):
    """Download Sharepoint documents and render site pages to local HTML files."""

    connection_config: SharepointConnectionConfig
    download_config: SharepointDownloaderConfig
    connector_type: str = CONNECTOR_TYPE

    def get_download_path(self, file_data: FileData) -> Path:
        """Return the local target path, forcing a ``.html`` suffix for site pages."""
        download_path = super().get_download_path(file_data=file_data)

        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        if content_type == SharepointContentType.SITEPAGE.value:
            # Update output extension to html if site page
            download_path = download_path.with_suffix(".html")
        return download_path

    def get_document(self, file_data: FileData) -> DownloadResponse:
        """Download the file identified by ``file_data.identifier`` to disk."""
        client: "ClientContext" = self.connection_config.get_client()
        file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing document content {file_data.source_identifiers.fullpath} to {download_path}"
        )
        with download_path.open("wb") as f:
            file.download(f).execute_query()
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def get_site_page(self, file_data: FileData) -> DownloadResponse:
        """Render a site page to an HTML file from metadata captured at index time.

        Titles come from the ``LayoutWebpartsContent`` metadata and body fragments
        from the ``innerHTML`` entries of ``CanvasContent1``; both are JSON strings.
        No request is made to Sharepoint here.
        """
        # TODO fetch comments for site page as well
        from lxml import etree, html

        canvas_content_raw = file_data.additional_metadata.get("CanvasContent1")
        layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent")
        html_content = []
        if layout_web_parts_content_raw:
            layout_web_parts_content = json.loads(layout_web_parts_content_raw)
            for web_part in layout_web_parts_content:
                properties = web_part.get("properties", {})
                if title := properties.get("title"):
                    # NOTE(review): the title is inserted without HTML escaping.
                    html_content.append(f"<title>{title}</title>")
        if canvas_content_raw:
            canvas_content = json.loads(canvas_content_raw)
            for canvas_part in canvas_content:
                if inner_html := canvas_part.get("innerHTML"):
                    html_content.append(inner_html)
        htmls = "".join(html_content)
        # Wrap the fragments in a single root element before parsing.
        markup = f"<div>{htmls}</div>"
        document = html.fromstring(markup)
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(
            f"writing site page content {file_data.source_identifiers.filename} to {download_path}"
        )
        # Explicit utf-8: the serialised page is text, and the platform default
        # encoding may be unable to represent every character in it.
        with download_path.open("w", encoding="utf-8") as f:
            f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
        return self.generate_download_response(file_data=file_data, download_path=download_path)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Dispatch to the document or site-page download by content type.

        Raises:
            ValueError: if ``sharepoint_content_type`` is missing from the metadata
                or is neither the document nor the site-page value.
        """
        content_type = file_data.additional_metadata.get("sharepoint_content_type")
        if not content_type:
            raise ValueError(
                f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}"
            )
        if content_type == SharepointContentType.DOCUMENT.value:
            return self.get_document(file_data=file_data)
        if content_type == SharepointContentType.SITEPAGE.value:
            return self.get_site_page(file_data=file_data)
        raise ValueError(f"content type not recognized: {content_type}")
440
+
441
+
442
# Source registry entry tying together the Sharepoint connection config, indexer
# and downloader defined in this module.
sharepoint_source_entry = SourceRegistryEntry(
    indexer=SharepointIndexer,
    indexer_config=SharepointIndexerConfig,
    downloader=SharepointDownloader,
    downloader_config=SharepointDownloaderConfig,
    connection_config=SharepointConnectionConfig,
)
@@ -0,0 +1,248 @@
1
+ import hashlib
2
+ import time
3
+ import xml.etree.ElementTree as ET
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Generator, Optional
8
+
9
+ from pydantic import Field, Secret
10
+
11
+ from unstructured_ingest.error import SourceConnectionError
12
+ from unstructured_ingest.logger import logger
13
+ from unstructured_ingest.utils.dep_check import requires_dependencies
14
+ from unstructured_ingest.v2.interfaces import (
15
+ AccessConfig,
16
+ ConnectionConfig,
17
+ Downloader,
18
+ DownloaderConfig,
19
+ DownloadResponse,
20
+ Indexer,
21
+ IndexerConfig,
22
+ )
23
+ from unstructured_ingest.v2.interfaces.file_data import (
24
+ FileData,
25
+ FileDataSourceMetadata,
26
+ SourceIdentifiers,
27
+ )
28
+ from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
29
+
30
+ if TYPE_CHECKING:
31
+ from slack_sdk import WebClient
32
+ from slack_sdk.web.async_client import AsyncWebClient
33
+
34
+ # NOTE: Pagination limit set to the upper end of the recommended range
35
+ # https://api.slack.com/apis/pagination#facts
36
# Passed as `limit` to the conversations_history / conversations_replies calls made
# by the indexer and the downloader (the precheck uses its own limit of 1).
+ PAGINATION_LIMIT = 200
37
+
38
# Connector identifier: stamped on every FileData this module creates and used as
# the default connector_type of the Slack indexer and downloader.
+ CONNECTOR_TYPE = "slack"
39
+
40
+
41
class SlackAccessConfig(AccessConfig):
    # Sensitive part of the Slack connection settings; the connection config holds
    # it wrapped in ``Secret[...]``.

    # Token handed to both the sync and the async Slack web clients.
    token: str = Field(
        description=(
            "Bot token used to access Slack API, "
            "must have channels:history scope for the bot user."
        )
    )
46
+
47
+
48
class SlackConnectionConfig(ConnectionConfig):
    # Connection settings for Slack; builds sync and async web clients from the
    # stored bot token.

    access_config: Secret[SlackAccessConfig]

    @requires_dependencies(["slack_sdk"], extras="slack")
    @SourceConnectionError.wrap
    def get_client(self) -> "WebClient":
        """Return a synchronous Slack ``WebClient`` authenticated with the bot token."""
        from slack_sdk import WebClient

        bot_token = self.access_config.get_secret_value().token
        return WebClient(token=bot_token)

    @requires_dependencies(["slack_sdk"], extras="slack")
    @SourceConnectionError.wrap
    def get_async_client(self) -> "AsyncWebClient":
        """Return an ``AsyncWebClient`` authenticated with the bot token."""
        from slack_sdk.web.async_client import AsyncWebClient

        bot_token = self.access_config.get_secret_value().token
        return AsyncWebClient(token=bot_token)
64
+
65
+
66
class SlackIndexerConfig(IndexerConfig):
    # Options selecting which channels and which time window the Slack indexer reads.

    # Channel ids to index; required (no default).
    channels: list[str] = Field(
        description=(
            "Comma-delimited list of Slack channel IDs to pull messages from, "
            "can be both public or private channels."
        )
    )
    # Lower bound of the time window; None means no lower bound is sent.
    start_date: Optional[datetime] = Field(
        description=(
            "Start date/time in formats "
            "YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM] or YYYY-MM-DD"
        ),
        default=None,
    )
    # Upper bound of the time window; None means no upper bound is sent.
    end_date: Optional[datetime] = Field(
        description=(
            "End date/time in formats "
            "YYYY-MM-DD[T]HH:MM[:SS[.ffffff]][Z or [±]HH[:]MM] or YYYY-MM-DD"
        ),
        default=None,
    )
81
+
82
+
83
@dataclass
class SlackIndexer(Indexer):
    """Index Slack channels, emitting one ``FileData`` per page of channel history."""

    index_config: SlackIndexerConfig
    connection_config: SlackConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Yield a ``FileData`` for every non-empty history page of each channel.

        The configured start/end dates are sent as the ``oldest``/``latest``
        bounds of the history query; an unset date sends ``None``.
        """
        client = self.connection_config.get_client()
        start_date = self.index_config.start_date
        end_date = self.index_config.end_date
        # The bounds do not depend on the channel, so convert them once up front.
        oldest = str(start_date.timestamp()) if start_date is not None else None
        latest = str(end_date.timestamp()) if end_date is not None else None
        for channel in self.index_config.channels:
            # Iterating the response walks through the result pages.
            for conversation_history in client.conversations_history(
                channel=channel,
                oldest=oldest,
                latest=latest,
                limit=PAGINATION_LIMIT,
            ):
                messages = conversation_history.get("messages", [])
                if messages:
                    yield self._messages_to_file_data(messages, channel)

    def _messages_to_file_data(
        self,
        messages: list[dict],
        channel: str,
    ) -> FileData:
        """Describe one page of messages as a ``FileData`` record.

        The identifier is the SHA-256 of ``"<channel>-<oldest ts>-<newest ts>"``;
        its first 16 hex characters name the ``.xml`` file. The record locator
        carries the channel and both timestamps so the downloader can re-query
        the same range.

        Args:
            messages: non-empty list of message dicts, each with a ``ts`` key.
            channel: id of the channel the messages came from.
        """
        timestamps = [message["ts"] for message in messages]
        # Compare numerically but keep the original string form of the timestamps.
        ts_oldest = min(timestamps, key=float)
        ts_newest = max(timestamps, key=float)

        identifier_base = f"{channel}-{ts_oldest}-{ts_newest}"
        identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
        filename = identifier[:16]

        return FileData(
            identifier=identifier,
            connector_type=CONNECTOR_TYPE,
            source_identifiers=SourceIdentifiers(
                filename=f"{filename}.xml", fullpath=f"{filename}.xml"
            ),
            metadata=FileDataSourceMetadata(
                date_created=ts_oldest,
                date_modified=ts_newest,
                date_processed=str(time.time()),
                record_locator={
                    "channel": channel,
                    "oldest": ts_oldest,
                    "latest": ts_newest,
                },
            ),
        )

    @SourceConnectionError.wrap
    def precheck(self) -> None:
        """Check access by requesting one history entry from every configured channel."""
        client = self.connection_config.get_client()
        for channel in self.index_config.channels:
            # NOTE: Querying conversations history guarantees that the bot is in the channel
            client.conversations_history(channel=channel, limit=1)
149
+
150
+
151
class SlackDownloaderConfig(DownloaderConfig):
    # No Slack-specific download options: everything comes from DownloaderConfig.
    ...
153
+
154
+
155
@dataclass
class SlackDownloader(Downloader):
    """Download a range of Slack messages, with their threads, into an XML file."""

    connector_type: str = CONNECTOR_TYPE
    connection_config: SlackConnectionConfig
    download_config: SlackDownloaderConfig = field(default_factory=SlackDownloaderConfig)

    def run(self, file_data, **kwargs):
        """Not supported: this downloader is async-only (see ``run_async``)."""
        raise NotImplementedError

    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
        """Download the conversation described by ``file_data``.

        Raises:
            ValueError: if no download path can be generated, or the record
                locator is missing required keys.
        """
        # NOTE: Indexer should provide source identifiers required to generate the download path
        download_path = self.get_download_path(file_data)
        if download_path is None:
            logger.error(
                "Generated download path is None, source_identifiers might be missing "
                "from FileData."
            )
            raise ValueError("Generated invalid download path.")

        await self._download_conversation(file_data, download_path)
        return self.generate_download_response(file_data, download_path)

    def is_async(self):
        """Report that this downloader must be driven through ``run_async``."""
        return True

    async def _download_conversation(self, file_data: FileData, download_path: Path) -> None:
        """Fetch the message range plus every thread and write them as XML.

        Raises:
            ValueError: if the record locator is ``None`` or lacks any of the
                ``channel``, ``oldest`` or ``latest`` keys.
        """
        # NOTE: Indexer should supply the record locator in metadata
        record_locator = file_data.metadata.record_locator
        if record_locator is None or any(
            key not in record_locator for key in ("channel", "oldest", "latest")
        ):
            logger.error(
                f"Invalid record locator in metadata: {record_locator}. "
                "Keys 'channel', 'oldest' and 'latest' must be present."
            )
            raise ValueError("Invalid record locator.")

        client = self.connection_config.get_async_client()
        messages = []
        async for conversation_history in await client.conversations_history(
            channel=record_locator["channel"],
            oldest=record_locator["oldest"],
            latest=record_locator["latest"],
            limit=PAGINATION_LIMIT,
            # NOTE: In order to get the exact same range of messages as indexer, it provides
            # timestamps of oldest and newest messages, inclusive=True is necessary to include them
            inclusive=True,
        ):
            messages += conversation_history.get("messages", [])

        conversation = []
        for message in messages:
            thread_messages = []
            async for conversations_replies in await client.conversations_replies(
                channel=record_locator["channel"],
                ts=message["ts"],
                limit=PAGINATION_LIMIT,
            ):
                thread_messages += conversations_replies.get("messages", [])

            # NOTE: Replies contains the whole thread, including the message references by the `ts`
            # parameter even if it's the only message (there were no replies).
            # Reference: https://api.slack.com/methods/conversations.replies#markdown
            conversation.append(thread_messages)

        conversation_xml = self._conversation_to_xml(conversation)
        download_path.parent.mkdir(exist_ok=True, parents=True)
        conversation_xml.write(download_path, encoding="utf-8", xml_declaration=True)

    def _conversation_to_xml(self, conversation: list[list[dict]]) -> ET.ElementTree:
        """Render threads as ``<messages><message><text>...</text></message>...``.

        Each thread becomes one ``<message>``; the parent text is followed by each
        reply's text, separated by ``" <reply> "``.

        Args:
            conversation: one list per thread, parent message first, then replies.

        Returns:
            An ``ElementTree`` rooted at ``<messages>``.
        """
        root = ET.Element("messages")

        for thread in conversation:
            if not thread:
                # No messages came back for this thread; nothing to render.
                continue
            message, *replies = thread
            message_elem = ET.SubElement(root, "message")
            text_elem = ET.SubElement(message_elem, "text")

            text = message.get("text")
            if replies:
                # Missing or None texts become "" so they neither show up as the
                # literal "None" nor break the join.
                parts = [text or ""]
                parts.extend(reply.get("text") or "" for reply in replies)
                text = " <reply> ".join(parts)
            text_elem.text = text

        return ET.ElementTree(root)
240
+
241
+
242
# Source registry entry tying together the Slack connection config, indexer and
# downloader defined in this module. The downloader config registered here is the
# Slack-specific class, matching the default that SlackDownloader builds for itself.
slack_source_entry = SourceRegistryEntry(
    indexer=SlackIndexer,
    indexer_config=SlackIndexerConfig,
    downloader=SlackDownloader,
    downloader_config=SlackDownloaderConfig,
    connection_config=SlackConnectionConfig,
)